Data insertion fails for BigQuery using Python

I'm using the BigQuery client by Tyler Treat (https://github.com/tylertreat/BigQuery-Python) from Python. The code runs without error and check_table returns True, but the data insertion fails. Please let me know if something is wrong in the following code.
from oauth2client.client import flow_from_clientsecrets
from bigquery import get_client
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
import sys, os
json_key = 'key.json'
client = get_client(json_key_file=json_key, readonly=True)
exists = client.check_table('Ucare', 'try')
schema = [
    {'name': 'time', 'type': 'STRING', 'mode': 'nullable'}
]
created = client.create_table('Ucare', 'try', schema)
print created
print exists
rows = [{'time': 'ipvbs6k16sp6bkut'}]
#rows = { 'rows':[{'json':{'event_type':'_session.stop'},'insertId' : 0}]}
inserted = client.push_rows('Ucare', 'try', rows,'24556135')
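For reference, the project's README documents push_rows as taking a list of dicts keyed by column name, with an insert-id column name as the last argument; the sketch below follows that shape (verify it against the version you have installed). Note also that get_client(..., readonly=True) requests a read-only scope in this library, which by itself may be enough to make inserts fail.
from bigquery import get_client

# Hedged sketch based on the BigQuery-Python README; check your installed version.
client = get_client(json_key_file='key.json', readonly=False)  # writable client

# push_rows expects a list of dicts keyed by column name.
rows = [{'time': 'ipvbs6k16sp6bkut'}]
# The last argument names the column used as the insert id (assumption: 'time' here).
inserted = client.push_rows('Ucare', 'try', rows, 'time')
print(inserted)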

Related

AttributeError: module 'config' has no attribute 'TWITTER_ACCESS_TOKEN_SECRET'. Did you mean: 'TWITTER_ACCESS_TOKEN'?

import streamlit as st
import pandas as pd
import numpy as np
import requests
import tweepy
import config
import psycopg2
import psycopg2.extras
import plotly.graph_objects as go
auth = tweepy.OAuthHandler(config.TWITTER_CONSUMER_KEY,
                           config.TWITTER_CONSUMER_SECRET)
auth.set_access_token(config.TWITTER_ACCESS_TOKEN,
                      config.TWITTER_ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)
The problem with my code is that it does not run with the Twitter keys; the config module has no such attributes.
The config module that you are attempting to import and read from does not contain what you need.
TWITTER_CONSUMER_KEY and TWITTER_CONSUMER_SECRET are not constants that come with some module; they are values you must supply yourself. There is perhaps a piece of code missing at the start of your application that looks like this:
config = {
    'TWITTER_CONSUMER_KEY': 'ENTER YOUR TWITTER CONSUMER KEY',
    'TWITTER_CONSUMER_SECRET': 'ENTER YOUR TWITTER CONSUMER SECRET'
}
Take a look at this article for more help. Good luck!
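Since the question's code reads the values as attributes of a config module (import config, then config.TWITTER_ACCESS_TOKEN_SECRET), the equivalent fix is to define all four names as plain constants in a config.py next to the script; a minimal sketch with placeholder values:
# config.py: placeholder values, fill in your own credentials.
TWITTER_CONSUMER_KEY = 'ENTER YOUR TWITTER CONSUMER KEY'
TWITTER_CONSUMER_SECRET = 'ENTER YOUR TWITTER CONSUMER SECRET'
TWITTER_ACCESS_TOKEN = 'ENTER YOUR TWITTER ACCESS TOKEN'
TWITTER_ACCESS_TOKEN_SECRET = 'ENTER YOUR TWITTER ACCESS TOKEN SECRET'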

API error encountered when trying to use matplotlib with gspread

I'm trying to use matplotlib to plot graphs using data from a google sheet. I read the documentation and enabled Google Drive & Google Sheets API but I get an error:
APIError: {'code': 400, 'message': 'This operation is not supported for this document', 'status': 'FAILED_PRECONDITION'}
On a side note, I'm quite new to data analytics: given a table with a column that contains only '0' or '1', how can I split the DataFrame and plot the rows where that column is '0' versus '1'?
import gspread
import pandas as pd
from google.colab import auth
from google.auth import default
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)
# sh = gc.open_by_url("")
sh = gc.open_by_key("")
worksheet = sh.worksheet("user06")
rows = worksheet.get_all_values()
pd.DataFrame.from_records(rows)
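On the side question about splitting on a 0/1 column: boolean masking in pandas does this. A minimal sketch, assuming the first sheet row holds the headers and using hypothetical column names 'label' (the 0/1 flag) and 'value' (the series to plot):
import matplotlib.pyplot as plt

# Build the DataFrame with the first row as column names (hypothetical columns).
df = pd.DataFrame.from_records(rows[1:], columns=rows[0])
df['label'] = df['label'].astype(int)

zeros = df[df['label'] == 0]  # rows where the flag column is 0
ones = df[df['label'] == 1]   # rows where the flag column is 1

plt.plot(zeros.index, zeros['value'].astype(float), label='label == 0')
plt.plot(ones.index, ones['value'].astype(float), label='label == 1')
plt.legend()
plt.show()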

Google cloud platform gcp delete snapshot 10 days older using python

I want to delete snapshots that are more than 10 days old in GCP using Python. I tried the program below with a filter expression, but unfortunately I ran into errors.
from datetime import datetime
from googleapiclient import discovery
import google.oauth2.credentials
from oauth2client.service_account import ServiceAccountCredentials
import sys
def get_disks(project, zone):
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        r"D:\Users\ganeshb\Desktop\Json\auth.json",
        scopes='https://www.googleapis.com/auth/compute')
    service = discovery.build('compute', 'v1', credentials=credentials)
    request = service.snapshots().list(project='xxxx',
                                       FILTER="creationTimestamp<'2021-05-31'")
    response = request.execute()
    print(response)

output = get_disks("xxxxxxxx", "europe-west1-b")
Your problem is a known Google Cloud bug.
Please read these issue trackers: 132365111 and 132676194
Solution:
Remove the filter statement and process the returned results:
from datetime import datetime
from dateutil import parser

request = service.snapshots().list(project=project)
response = request.execute()

# Watch for timezone issues here!
filter_date = '2021-05-31'
d1 = parser.parse(filter_date)

for item in response['items']:
    d2 = datetime.fromisoformat(item['creationTimestamp'])
    if d2.timestamp() < d1.timestamp():
        # Process the result here. This is a print statement stub.
        print("{} {}".format(item['name'], item['creationTimestamp']))

Error loading Glue ETL job into snowflake

I am trying to load data from CSV files in S3 buckets into Snowflake using a Glue ETL job. I wrote a Python script within the ETL job for this, shown below:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from py4j.java_gateway import java_import
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"
## #params: [JOB_NAME, URL, ACCOUNT, WAREHOUSE, DB, SCHEMA, USERNAME, PASSWORD]
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'URL', 'ACCOUNT', 'WAREHOUSE', 'DB', 'SCHEMA', 'USERNAME', 'PASSWORD'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
java_import(spark._jvm, "net.snowflake.spark.snowflake")
spark._jvm.net.snowflake.spark.snowflake.SnowflakeConnectorUtils.enablePushdownSession(spark._jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate())
sfOptions = {
    "sfURL": args['URL'],
    "sfAccount": args['ACCOUNT'],
    "sfUser": args['USERNAME'],
    "sfPassword": args['PASSWORD'],
    "sfDatabase": args['DB'],
    "sfSchema": args['SCHEMA'],
    "sfWarehouse": args['WAREHOUSE'],
}
dyf = glueContext.create_dynamic_frame.from_catalog(database="salesforcedb", table_name="pr_summary_csv", transformation_ctx="dyf")
df = dyf.toDF()
## df.write.format(SNOWFLAKE_SOURCE_NAME).options(**sfOptions).option("parallelism", "8").option("dbtable", "abcdef").mode("overwrite").save()
df.write.format(SNOWFLAKE_SOURCE_NAME).options(**sfOptions).option("dbtable", "abcdef").save()
job.commit()
The error thrown is:
error occurred while calling o81.save. Incorrect username or password was specified.
However, if I don't convert to a Spark DataFrame and use the DynamicFrame directly, I get an error like this:
AttributeError: 'function' object has no attribute 'format'
Could someone please look over my code and tell me what I'm doing wrong when converting a DynamicFrame to a DataFrame? Please let me know if I need to provide more information.
By the way, I'm a newbie to Snowflake and this is my first attempt at loading data through AWS Glue. 😊
error occurred while calling o81.save. Incorrect username or password was specified.
The error message says that there's an error about the user or the password. If you are sure that the user name and the password are correct, please be sure that the Snowflake account name and URL are also correct.
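For reference, the parameter values usually look like the sketch below; the account locator and region here are placeholders. sfURL is normally the full <account>.snowflakecomputing.com host name, while sfAccount is only the account identifier:
# Hypothetical example values; substitute your own account locator and region.
sfOptions = {
    "sfURL": "xy12345.eu-west-1.snowflakecomputing.com",  # full host name
    "sfAccount": "xy12345",                               # account identifier only
    "sfUser": "GLUE_ETL_USER",
    "sfPassword": "********",
    "sfDatabase": "MY_DB",
    "sfSchema": "PUBLIC",
    "sfWarehouse": "MY_WH",
}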
However, if I don't convert to a Spark DataFrame and use the
DynamicFrame directly, I get an error like this:
AttributeError: 'function' object has no attribute 'format'
The Glue DynamicFrame's write method is different from the Spark DataFrame's, so it's normal that they don't have the same methods. Please check the documentation:
https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-crawler-pyspark-extensions-dynamic-frame.html#aws-glue-api-crawler-pyspark-extensions-dynamic-frame-write
It seems you need to give the parameters as connection_options:
write(connection_type, connection_options, format, format_options, accumulator_size)
connection_options = {"url": "jdbc-url/database", "user": "username", "password": "password","dbtable": "table-name", "redshiftTmpDir": "s3-tempdir-path"}
Even if you use the DynamicFrame, you will probably end up with the same incorrect-username-or-password error, so I suggest you focus on fixing the credentials first.
Here is tested Glue code (you can copy-paste it as is, only changing the table name) that you can use for setting up the Glue ETL job.
You will have to add the Snowflake JDBC and Spark connector JARs. You can use the link below for the setup:
https://community.snowflake.com/s/article/How-To-Use-AWS-Glue-With-Snowflake
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from py4j.java_gateway import java_import
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake";
## #params: [JOB_NAME, URL, ACCOUNT, WAREHOUSE, DB, SCHEMA, USERNAME, PASSWORD]
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'URL', 'ACCOUNT', 'WAREHOUSE', 'DB', 'SCHEMA', 'USERNAME', 'PASSWORD'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## uj = sc._jvm.net.snowflake.spark.snowflake
spark._jvm.net.snowflake.spark.snowflake.SnowflakeConnectorUtils.enablePushdownSession(spark._jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate())
sfOptions = {
    "sfURL": args['URL'],
    "sfAccount": args['ACCOUNT'],
    "sfUser": args['USERNAME'],
    "sfPassword": args['PASSWORD'],
    "sfDatabase": args['DB'],
    "sfSchema": args['SCHEMA'],
    "sfWarehouse": args['WAREHOUSE'],
}
## Read from a Snowflake table into a Spark Data Frame
df = spark.read.format(SNOWFLAKE_SOURCE_NAME).options(**sfOptions).option("query", "Select * from <tablename>").load()
df.show()
## Perform any kind of transformations on your data and save as a new Data Frame:
## df1 = df.[Insert any filter, transformation, or other operation]
## Write the Data Frame contents back to Snowflake in a new table:
## df1.write.format(SNOWFLAKE_SOURCE_NAME).options(**sfOptions).option("dbtable", "[new_table_name]").mode("overwrite").save()
job.commit()

How to upload a local CSV to google big query using python

I'm trying to upload a local CSV to google big query using python
def uploadCsvToGbq(self, table_name):
    load_config = {
        'destinationTable': {
            'projectId': self.project_id,
            'datasetId': self.dataset_id,
            'tableId': table_name
        }
    }
    load_config['schema'] = {
        'fields': [
            {'name': 'full_name', 'type': 'STRING'},
            {'name': 'age', 'type': 'INTEGER'},
        ]
    }
    load_config['sourceFormat'] = 'CSV'
    upload = MediaFileUpload('sample.csv',
                             mimetype='application/octet-stream',
                             # This enables resumable uploads.
                             resumable=True)
    start = time.time()
    job_id = 'job_%d' % start
    # Create the job.
    result = bigquery.jobs.insert(
        projectId=self.project_id,
        body={
            'jobReference': {
                'jobId': job_id
            },
            'configuration': {
                'load': load_config
            }
        },
        media_body=upload).execute()
    return result
When I run this it throws an error like:
"NameError: global name 'MediaFileUpload' is not defined"
Is there a module I need to import? Please help.
One of the easiest ways to upload a CSV file to GBQ is through pandas. Just read the CSV file into pandas (pd.read_csv()), then push it from pandas to GBQ (df.to_gbq(full_table_id, project_id=project_id)).
import pandas as pd
import csv
df=pd.read_csv('/..localpath/filename.csv')
df.to_gbq(full_table_id, project_id=project_id)
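If the destination table already exists, to_gbq's if_exists parameter controls whether the load fails, replaces, or appends; a short sketch with placeholder names:
import pandas as pd

df = pd.read_csv('/..localpath/filename.csv')
# if_exists accepts 'fail' (default), 'replace', or 'append'.
df.to_gbq('my_dataset.new_table',      # placeholder full table id
          project_id='my-project-id',  # placeholder project
          if_exists='append')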
Or you can use the google-cloud-bigquery client API:
from google.cloud import bigquery
import pandas as pd
df=pd.read_csv('/..localpath/filename.csv')
client = bigquery.Client()
dataset_ref = client.dataset('my_dataset')
table_ref = dataset_ref.table('new_table')
client.load_table_from_dataframe(df, table_ref).result()
pip install --upgrade google-api-python-client
Then at the top of your Python file write:
from googleapiclient.http import MediaFileUpload
But be careful: you are missing some parentheses, since jobs is a method and must be called. Better write:
result = bigquery.jobs().insert(projectId=PROJECT_ID, body={'jobReference': {'jobId': job_id},'configuration': {'load': load_config}}, media_body=upload).execute(num_retries=5)
And by the way, you are going to upload all your CSV rows, including the top one that defines columns.
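If that first row is a header, the load configuration can skip it; a one-line addition to the question's load_config (skipLeadingRows is a standard field of the BigQuery load job configuration):
# Skip the header row so the column names are not loaded as data.
load_config['skipLeadingRows'] = 1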
The class MediaFileUpload is in http.py. See https://google-api-python-client.googlecode.com/hg/docs/epy/apiclient.http.MediaFileUpload-class.html
