BigQuery: Create table with time partitioning and clustering fields using Python

I can successfully create a BigQuery table in Python as:
from google.cloud import bigquery
bq_client = bigquery.Client()
table_name = "my_test_table"
dataset = bq_client.dataset("MY_TEST_DATASET")
table_ref = dataset.table(table_name)
table = bigquery.Table(table_ref)
table = bq_client.create_table(table)
And later I upload a local Pandas DataFrame as:
# --- Define BQ options ---
job_config = bigquery.LoadJobConfig()
job_config.write_disposition = "WRITE_APPEND"
job_config.source_format = bigquery.SourceFormat.CSV
# --- Load data ---
job = bq_client.load_table_from_dataframe(
    df, f"MY_TEST_DATASET.{table_name}", job_config=job_config
)
How can I specify, while creating the table using Python:
Partitioning by daily ingestion time
Clustering on the fields ["business_id", "software_house", "product_id"]

You can use the following Python script to create your BigQuery table with partitioning and clustering:
def create_table():
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    table_id = "your_project.your_dataset.table_test"

    schema = [
        bigquery.SchemaField("business_id", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("software_house", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("product_id", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("other_field", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("ingestion_time", "TIMESTAMP", mode="NULLABLE"),
    ]

    table = bigquery.Table(table_id, schema=schema)

    # Clustering.
    table.clustering_fields = ["business_id", "software_house", "product_id"]

    # Partitioning.
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        field="ingestion_time",     # name of column to use for partitioning
        expiration_ms=7776000000,   # 90 days
    )

    table = client.create_table(table)

    print(
        "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
    )


if __name__ == '__main__':
    create_table()
In this case:
Partitioning was added on the ingestion_time field, per DAY
Clustering was added on the ["business_id", "software_house", "product_id"] fields
These docs show how to add partitioning and clustering on fields:
BQ table partitioning
BQ table clustering
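Note that the snippet above partitions on your own ingestion_time TIMESTAMP column. If what you actually want is BigQuery's built-in ingestion-time partitioning (the _PARTITIONTIME pseudo-column) rather than a column of your own, a minimal sketch, assuming that is your intent, simply omits the field argument:
# Hedged variant: daily ingestion-time partitioning on _PARTITIONTIME,
# i.e. no partitioning column of your own (field is left unset).
table.time_partitioning = bigquery.TimePartitioning(
    type_=bigquery.TimePartitioningType.DAY
)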

Related

How to bulk insert data into an MSSQL database in an AWS Glue Python shell job?

I have large sets of data in S3. In my Python Glue job, I extract data from those files into a pandas data frame, apply the necessary transformations, and then load it into a Microsoft SQL database using the pymssql library. The final data frame contains on average 100-200K rows and 180 columns of data. Currently I am using pymssql to connect to the database. The problem is that executemany of the cursor class takes too long to load the data: approximately 20 minutes for 100K rows. I checked the logs and it was always the loading that was slow (screenshot attached). How can I load the data faster? I am attaching my code here:
file = s3.get_object(Bucket=S3_BUCKET_NAME, Key=each_file)
for chunk in pd.read_csv(file['Body'], sep=",", header=None, low_memory=False, chunksize=100000):
    all_data.append(chunk)
data_frame = pd.concat(all_data, axis=0)
all_data.clear()
cols = data_frame.select_dtypes(object).columns
data_frame[cols] = data_frame[cols].apply(lambda x: x.str.strip())
data_frame.replace(to_replace='', value=np.nan, inplace=True)
data_frame.fillna(value=np.nan, inplace=True)
data_frame.insert(0, 'New-column', 1111)
sql_data_array = data_frame.replace({np.nan: None}).to_numpy()
sql_data_tuple = tuple(map(tuple, sql_data_array))
try:
    sql = "insert into [db].[schema].[table](column_names)values(%d,%s,%s,%s,%s,%s...)"
    db_cursor.executemany(sql, sql_data_tuple)
    print("loading completed on {}".format(datetime.datetime.now()))
except Exception as e:
    print(e)
I ended up doing this and it gave me much better results (1 million rows in 11 minutes):
(Use a Glue 2.0 Python job instead of a Python shell job.)
Extracted the data from S3.
Transformed it using pandas.
Uploaded the transformed file as a CSV to S3.
Created a dynamic frame from a catalog table that was created by crawling the transformed CSV file. Alternatively, you can create the dynamic frame directly using options (see the sketch after the code below).
Synchronized the dynamic frame to the catalog table that was created by crawling the destination MSSQL table.
csv_buffer = StringIO()
s3_resource = boto3.resource("s3", region_name=AWS_REGION)
file = s3.get_object(Bucket=S3_BUCKET_NAME, Key=each_file)
for chunk in pd.read_csv(file['Body'], sep=",", header=None, low_memory=False, chunksize=100000):
    all_data.append(chunk)
data_frame = pd.concat(all_data, axis=0)
all_data.clear()
cols = data_frame.select_dtypes(object).columns
data_frame[cols] = data_frame[cols].apply(lambda x: x.str.strip())
data_frame.replace(to_replace='', value=np.nan, inplace=True)
data_frame.fillna(value=np.nan, inplace=True)
data_frame.insert(0, 'New-column', 1234)
data_frame.to_csv(csv_buffer)
result = s3_resource.Object(S3_BUCKET_NAME, 'path in s3').put(Body=csv_buffer.getvalue())

datasource0 = glueContext.create_dynamic_frame.from_catalog(database="source db name", table_name="source table name", transformation_ctx="datasource0")
applymapping1 = ApplyMapping.apply(frame=datasource0, mappings=[mappings], transformation_ctx="applymapping1")
selectfields2 = SelectFields.apply(frame=applymapping1, paths=[column names of destination catalog table], transformation_ctx="selectfields2")
resolvechoice3 = ResolveChoice.apply(frame=selectfields2, choice="MATCH_CATALOG", database="destination dbname", table_name="destination table name", transformation_ctx="resolvechoice3")
resolvechoice4 = ResolveChoice.apply(frame=resolvechoice3, choice="make_cols", transformation_ctx="resolvechoice4")
datasink5 = glueContext.write_dynamic_frame.from_catalog(frame=resolvechoice4, database="destination db name", table_name="destination table name", transformation_ctx="datasink5")
job.commit()
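As mentioned in the steps above, the source dynamic frame can also be built directly from the transformed CSV in S3, without crawling it into a catalog table first. A minimal sketch of that variant (the bucket name and path are illustrative placeholders, not from the original post):
# Hedged alternative to the from_catalog read above: build the source
# dynamic frame straight from the transformed CSV files in S3.
datasource0 = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options={"paths": ["s3://YOUR_BUCKET/path in s3/"]},
    format="csv",
    format_options={"withHeader": True},
    transformation_ctx="datasource0",
)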

BigQuery create external tables from Parquet files with autodetect schema using Python

I'm not able to find any example of creating external tables from Parquet files with autodetect schema. Here is my current code:
bq_client = bigquery.Client.from_service_account_json(key_path)
table_name = "my_table"
table_id = f"{PROJECT_ID}.{DATASET}.{table_name}"
dataset_ref = bq_client.dataset(DATASET)
table_ref = bigquery.TableReference(dataset_ref, table_name)
table_schema = [bigquery.schema.SchemaField("example", "STRING")]  # I don't want this
table = bigquery.Table(table_ref, table_schema)  # I don't want this

external_config = bigquery.ExternalConfig(source_format='PARQUET')
source_uris = [f"gs://path/to/file_name.snappy.parquet"]
external_config.source_uris = source_uris
external_config.autodetect = True

table.external_data_configuration = external_config  # Not sure how to do this
bq_client.create_table(table)  # and this without table schema
logger.debug("Created table '{}'.".format(table_id))
Currently I have to specify the table schema. I want to autodetect the schema instead. Kindly help. Thank you.
Check out the documentation https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet#loading_parquet_data_into_a_new_table
from google.cloud import bigquery

# Construct a BigQuery client object.
client = bigquery.Client()

# TODO(developer): Set table_id to the ID of the table to create.
# table_id = "your-project.your_dataset.your_table_name"

job_config = bigquery.LoadJobConfig(source_format=bigquery.SourceFormat.PARQUET)

uri = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet"

load_job = client.load_table_from_uri(
    uri, table_id, job_config=job_config
)  # Make an API request.

load_job.result()  # Waits for the job to complete.

destination_table = client.get_table(table_id)
print("Loaded {} rows.".format(destination_table.num_rows))

Error while trying to append data to a BigQuery table using pandas data frame

I have a pandas data frame that looks like this:
It has 6 columns in it. I tried appending it to an existing table in BigQuery with the same schema with this:
import os
from google.cloud import bigquery

# Login credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "secret.json"

# Initialize big query
client = bigquery.Client()

# Table information
project = "xxxxxxxx"
dataset = "Vahan"
table = "rto_data"
table_id = '{}.{}.{}'.format(project, dataset, table)

# Setup for upload
job_config = bigquery.LoadJobConfig()

# Define the table schema
schema = [
    bigquery.SchemaField(name='State', field_type='STRING', mode='NULLABLE'),
    bigquery.SchemaField(name='RTO', field_type='STRING', mode='NULLABLE'),
    bigquery.SchemaField(name='Registration_Number', field_type='STRING', mode='NULLABLE'),
    bigquery.SchemaField(name='Maker', field_type='STRING', mode='NULLABLE'),
    bigquery.SchemaField(name='Date', field_type='DATE', mode='NULLABLE'),
    bigquery.SchemaField(name='Registrations', field_type='INTEGER', mode='NULLABLE'),
]
job_config.create_disposition = "CREATE_IF_NEEDED"

# Make the API request
load_result = client.load_table_from_dataframe(
    dataframe=df,
    destination=table_id,
    job_config=job_config,
)

# Wait for query to finish working
load_result.result()

# Make an API request.
table = client.get_table(table_id)

# Output
print("Loaded {} rows and {} columns to {}".format(table.num_rows, len(table.schema), table_id))
and I'm getting this error: BadRequest: 400 Provided Schema does not match Table advanced-analytics-123456:Vahan.rto_data. Cannot add fields (field: __index_level_0__)
I put the data in a new table and it looks like the load job is adding a new column called __index_level_0__.
How do I fix this so that I can append the data to my existing table? Your help would be greatly appreciated!
Maybe you have a __index_level_0__ column in the dataframe?
Try dropping the index:
df.reset_index(drop=True, inplace=True)
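For completeness, a minimal sketch of how that fits into the load code from the question (same df, table_id and job_config as above):
# Drop the pandas index before the load so no __index_level_0__ column
# is serialized along with the data frame.
df = df.reset_index(drop=True)

load_result = client.load_table_from_dataframe(
    dataframe=df, destination=table_id, job_config=job_config
)
load_result.result()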

BigQuery: How to set `require_partition_filter` by TableReference

I want to set require_partition_filter to True on a BigQuery table, but I can only access a TableReference instead of a Table. How can I do this?
UPDATE
Maybe I did not express my question clearly. I need to write a Python program to do this; I would not like to set the configuration with commands or SQL, because there are too many tables. In this program, I can generate a TableReference table_ref as shown below. But how do I set require_partition_filter on table_ref?
def table(client, dataset_name, table_name):
    dataset = client.dataset(dataset_name)
    table_ref = dataset.table(table_name)
    return table_ref

job = client.load_table_from_uri(
    glob,       # google cloud storage bucket
    table_ref,  # returned by table() function above
    job_id='123',
    job_config=config,  # at first I set `require_partition_filter` here, but this is deprecated
)
How to do this?
As mentioned in this answer, you can use an ALTER command to alter your table as follows:
#standardSQL
ALTER TABLE IF EXISTS mydataset.newtable
SET OPTIONS(
require_partition_filter = false
)
You can change require_partition_filter back to true if needed using the same command.
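Since the question asks for a Python program rather than SQL run by hand, the same statement can also be issued through the client library; a minimal sketch (the table name is the example one from above):
from google.cloud import bigquery

client = bigquery.Client()

# Run the ALTER TABLE statement as a regular query job.
query = """
ALTER TABLE IF EXISTS mydataset.newtable
SET OPTIONS (require_partition_filter = true)
"""
client.query(query).result()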
Note that the preferred way would be to get a bigquery.Table object from the TableReference, change require_partition_filter, and patch the table (there is a similar example with the expiration date; a sketch of this approach appears at the end of this answer). However, as you mention that you can only access the TableReference, you can also set TimePartitioning.require_partition_filter (deprecated according to the docs) when creating the table with a load job. For example:
job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("foo", "STRING"),
    ],
    time_partitioning=bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        require_partition_filter=True,
    ),
)
and it works as expected:
$ bq query "SELECT * FROM test.require_filter"
Waiting on JOB_ID ... (0s) Current status: DONE
Error in query string: Error processing job 'JOB_ID': Cannot query over table 'test.require_filter' without a filter over column(s) '_PARTITION_LOAD_TIME',
'_PARTITIONDATE', '_PARTITIONTIME' that can be used for partition elimination
Full code:
import pandas as pd
from google.cloud import bigquery

PROJECT = "PROJECT_ID"
DATASET = "test"
TABLE = "require_filter"


def table(client, dataset_name, table_name):
    dataset = client.dataset(dataset_name)
    table_ref = dataset.table(table_name)
    return table_ref


client = bigquery.Client(project=PROJECT)

job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("foo", "STRING"),
    ],
    time_partitioning=bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        require_partition_filter=True,
    ),
)

data = {"foo": ["bar"]}
df = pd.DataFrame(data)

table_ref = table(client, DATASET, TABLE)

load_job = client.load_table_from_dataframe(
    df,
    table_ref,
    job_config=job_config,
)

result = load_job.result()
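For reference, a minimal sketch of the "preferred way" mentioned at the start of this answer, assuming the table already exists and you only hold a TableReference:
# Resolve the TableReference into a full Table, flip the flag, and patch
# only that field back to BigQuery.
table = client.get_table(table_ref)  # API request
table.require_partition_filter = True
table = client.update_table(table, ["require_partition_filter"])  # API request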

Create a table and populate it with the result of a query using the Python API in BigQuery

I want to create a new table and fill it with the result of a query.
from google.cloud import bigquery

bigquery_client = bigquery.Client(project="myproject")
dataset = bigquery_client.dataset("mydataset")
table_ref = dataset.table('New table')
table = bigquery.Table(table_ref)
table = bigquery_client.create_table(table)
So I created an empty table.
Now I wish to fill this table with values from another table called "Old table":
query = "SELECT * FROM `{Old table}`"
How can I make sure that my table is linked to this query?
I tried
table.view_query = "SELECT * FROM `{Old table}`"
but it didn't work
Thanks,
I think you should use something like this:
bigquery_client = bigquery.Client(project="myproject")
dataset = bigquery_client.dataset("mydataset")
table_ref = dataset.table('New_table')
sql_query = "SELECT * FROM `{project}.{dataset}.{table}`"
job_config = bigquery.QueryJobConfig()
# Set configuration.query.destinationTable
job_config.destination = table_ref
# Set configuration.query.createDisposition
job_config.create_disposition = 'CREATE_IF_NEEDED'
# Set configuration.query.writeDisposition
job_config.write_disposition = 'WRITE_APPEND'
# Start the query
job = bigquery_client.query(sql_query, job_config=job_config)
# Wait for the query to finish
job.result()
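As an aside on the attempt in the question: table.view_query only takes effect when you create a view, not a regular table. If a live view over the old table would also work for you, a minimal sketch (the table names here are illustrative, reusing the client and dataset from the code above):
# Hedged alternative: create a VIEW instead of materializing a new table.
view_ref = dataset.table('New_view')
view = bigquery.Table(view_ref)
view.view_query = "SELECT * FROM `myproject.mydataset.Old_table`"
view = bigquery_client.create_table(view)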
