Store and retrieve pickled python objects to/from snowflake - python
As per the title, I am trying to store pickled Python objects in Snowflake, to get them back again at a later date. Help on this would be much appreciated.
Snowflake table definition:
CREATE OR REPLACE TABLE <db>.<schema>.TESTING_MEMORY (
MODEL_DATETIME DATETIME,
SCALARS VARIANT
);
Python code:
import numpy as np
import pandas as pd
import pickle
from datetime import datetime
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas
from sklearn.preprocessing import StandardScaler
def create_snowflake_connection():
    conn = snowflake.connector.connect(
        user='<username>',
        account='<account>',
        password='<password>',
        warehouse='<wh>',
        database='<db>',
        role='<role>',
        schema='<schema>'
    )
    return conn
memory = {}
np.random.seed(78)
df = pd.DataFrame({
    'x1': np.random.normal(0, 2, 10000),
    'x2': np.random.normal(5, 3, 10000),
    'x3': np.random.normal(-5, 5, 10000)
})
scaler = StandardScaler()
scaler.fit(df)
scaled_df = scaler.transform(df)
scaled_df = pd.DataFrame(scaled_df, columns=['x1', 'x2', 'x3'])
memory['SCALARS'] = pickle.dumps(scaler)
ctx = create_snowflake_connection()
# Write to snowflake
db_dat = pd.DataFrame([list(memory.values())], columns=list(memory.keys()))
db_dat.insert(0, 'MODEL_DATETIME', datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"))
success, nchunks, nrows, _ = write_pandas(conn=ctx, df = db_dat, table_name = 'TESTING_MEMORY')
# Retrieve from snowflake
cur = ctx.cursor()
sql = """
SELECT hex_encode(SCALARS)
FROM <db>.<schema>.TESTING_MEMORY
QUALIFY ROW_NUMBER() OVER (ORDER BY MODEL_DATETIME DESC) = 1
"""
cur.execute(sql)
returned = cur.fetch_pandas_all()
cur.close()
ctx.close()
It seems like you're trying to put a Python bytes object into a Snowflake VARIANT, which won't work for you.
This answer is similar to what the other answer here suggests, except that rather than using a VARCHAR field to store base64-encoded binary, it uses a BINARY type instead. From what I've read, base64 encoding is around 30% larger than raw binary.
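As a rough illustration of that size overhead (my addition, a minimal local sketch rather than part of the original answer), you can compare the two encodings of the same pickled object:
import base64
import pickle

payload = pickle.dumps({"weights": list(range(1000))})  # any picklable object

raw_len = len(payload)                    # bytes that would land in a BINARY column
b64_len = len(base64.b64encode(payload))  # characters that would land in a VARCHAR column

# base64 expands the data by roughly 4/3 (~33%)
print(raw_len, b64_len, b64_len / raw_len)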
Create the table with binary data type:
create or replace table testdb.public.test_table (obj binary);
Hex encode the pickled object, write it, read it back and call a method on it:
import pickle
import snowflake.connector

# This is the object we're going to store in Snowflake as binary
class PickleMe:
    def __init__(self, first_name, last_name):
        self.first_name = first_name
        self.last_name = last_name

    def say_hello(self):
        print(f'Hi there, {self.first_name} {self.last_name}')

# Create the object and store it as hex in the 'hex_person' variable
person = PickleMe('John', 'Doe')
hex_person = pickle.dumps(person).hex()

with snowflake.connector.connect(
    user="username",
    password="password",
    account="snowflake_account_deets",
    warehouse="warehouse_name",
) as con:
    # Write the pickled object into the table as binary
    con.cursor().execute(f"INSERT INTO testdb.public.test_table values(to_binary('{hex_person}', 'HEX'))")
    # Now get the object back and put it into the 'obj' variable
    (obj,) = con.cursor().execute("select obj from testdb.public.test_table").fetchone()
    # Deserialise the bytes from the BINARY column and call a method on the resulting object
    person_obj = pickle.loads(obj)
    person_obj.say_hello()
The output of the above is
Hi there, John Doe
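As a side note (my addition, not part of the original answer), you can avoid interpolating the hex string into the SQL text by letting the connector bind it as a parameter; the Snowflake Python connector's default pyformat binding should accept something along these lines, reusing the con and person from the block above:
# Hedged sketch: bind the hex string instead of formatting it into the SQL
hex_person = pickle.dumps(person).hex()
con.cursor().execute(
    "INSERT INTO testdb.public.test_table VALUES (to_binary(%s, 'HEX'))",
    (hex_person,),
)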
There is probably a better way to do this (disclaimer: I am new to Python), but this seems to work and is based off the answer here: How can I pickle a python object into a csv file?
1. Change the SQL table definition
CREATE OR REPLACE TABLE db.schema.TESTING_MEMORY (
MODEL_DATETIME DATETIME,
SCALARS VARCHAR
);
2. Changes to Python code - general
import base64
3. Changes to Python code (write to Snowflake section above)
# Write to snowflake
db_dat = pd.DataFrame([list(memory.values())], columns=list(memory.keys()))
db_dat.insert(0, 'MODEL_DATETIME', datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"))
pickled_columns = ['SCALARS']
for column in pickled_columns:
    b64_bytes = base64.b64encode(db_dat[column].values[0])
    db_dat[column] = b64_bytes.decode('utf8')
success, nchunks, nrows, _ = write_pandas(conn=ctx, df = db_dat, table_name = 'TESTING_MEMORY')
4. Changes to Python code - retrieve from Snowflake
cur = ctx.cursor()
sql = """
SELECT *
FROM db.schema.TESTING_MEMORY
QUALIFY ROW_NUMBER() OVER (ORDER BY MODEL_DATETIME DESC) = 1
"""
cur.execute(sql)
returned = cur.fetch_pandas_all()
for column in pickled_columns:
    returned[column] = base64.b64decode(returned[column].values[0])
new_dict = returned.to_dict('list')
for key, val in new_dict.items():
    new_dict[key] = val[0]
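To actually use the fitted scaler again, the base64-decoded bytes in new_dict still need to be unpickled; a minimal sketch of that last step (my addition, assuming the variable names above):
# Unpickle the retrieved bytes back into a StandardScaler
restored_scaler = pickle.loads(new_dict['SCALARS'])

# The restored object should behave like the original scaler
print(restored_scaler.transform(df[:5]))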
Related
TypeError: tuple indices must be integers or slices, not list - when trying to import in SQL Server
I am getting "TypeError: tuple indices must be integers or slices, not list" when trying to insert my data into SQL Server. How can I update my code? I tried changing records = df_data.values.tolist() to records = df_data.values.totuple(), but no success. Adding in more text for text requirements on this post as it says that my post is mostly code and wants me to add more details.
Edit: Updated code
import yfinance as yf
import glob
import pandas as pd
import pyodbc
import os
import sqlite3

os.chdir(r"C:\Users\Empyz\Desktop")

list1 = ['2022-03-18', '2022-03-25', '2022-04-01', '2022-04-08', '2022-04-14', '2022-04-22', '2022-05-20', '2022-06-17', '2022-07-15', '2022-10-21', '2023-01-20', '2024-01-19']

gme = yf.Ticker("gme")

for date in list1:
    df = gme.option_chain(date)
    df_call = df[0]
    df_put = df[1]
    df_call.to_csv(f'C:\Empyz\Deskop\\call_{date}.csv', index=False)
    df_put.to_csv(f'C:\Empyz\Deskop\\put_{date}.csv', index=False)

extension = 'csv'
all_filenames = [i for i in glob.glob(f'*.{extension}')]

# combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
# export to csv
combined_csv.to_csv("Options_Data_Combined.csv", index=False, encoding='utf-8-sig')
print(combined_csv)

conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=localhost;'
                      'Database=Stocks;'
                      'Trusted_Connection=yes;')
c = conn.cursor()

table_name = "Options_Data_GME"

# create table. I'm lazy so everything is just set to text
c.execute(f'CREATE TABLE IF NOT EXISTS [{table_name}] (contractSymbol nvarchar(50), lastTradeDate nvarchar(50), strike decimal(18,4), lastPrice decimal(18,4), bid decimal(18,4), ask decimal(18,4), change decimal(18,4), percentChange decimal(18,4), volume float, openInterest float, impliedVolatility float, inTheMoney nvarchar(50), contractSize nvarchar(50), currency nvarchar(50))')
conn.commit()

combined_csv.to_sql(table_name, conn, if_exists='replace')

c.execute(f''' SELECT * FROM {table_name} ''')
for row in c.fetchall():
    print(row)
So the easy way is to just use pandas' df.to_sql() method. See the docs here. See the code below for a simple implementation using a local sqlite3 database. You should be able to easily adapt this to a pyodbc connection. Note that I've removed your comments and added my own so it's easier to see what I'm doing.
import yfinance as yf
import glob
import pandas as pd
import sqlite3

FILEPATH = "./"  ###### EDITED

list1 = ['2022-03-18', '2022-03-25', '2022-04-01', '2022-04-08', '2022-04-14', '2022-04-22', '2022-05-20', '2022-06-17', '2022-07-15', '2022-10-21', '2023-01-20', '2024-01-19']

gme = yf.Ticker("gme")

for date in list1:
    df = gme.option_chain(date)
    df_call = df[0]
    df_put = df[1]
    # outputs options data to csv based on dates and type
    df_call.to_csv(f'{FILEPATH}call_{date}.csv', index=False)  ###### EDITED
    df_put.to_csv(f'{FILEPATH}put_{date}.csv', index=False)  ###### EDITED

extension = 'csv'
# use f-strings instead of .format()
all_filenames = [i for i in glob.glob(f'{FILEPATH}*.{extension}')]  ###### EDITED

# create dataframe with all the CSVs combined.
combined_df = pd.concat([pd.read_csv(f) for f in all_filenames])

# using sqlite3 to create a test db
conn = sqlite3.connect('test_database')
c = conn.cursor()

table_name = "Options_Data_GME"

# create table. I'm lazy so everything is just set to text
c.execute(f'CREATE TABLE IF NOT EXISTS {table_name} (contractSymbol text, lastTradeDate text, strike text, lastPrice text, bid text, ask text, change text, percentChange text, volume text, openInterest text, impliedVolatility text, inTheMoney text, contractSize text, currency text)')
conn.commit()

combined_df.to_sql(table_name, conn, if_exists='replace')

c.execute(f''' SELECT * FROM {table_name} ''')
for row in c.fetchall():
    print(row)
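If you want to point this at SQL Server rather than sqlite3 (my addition, a hedged sketch rather than part of the original answer), to_sql() is normally paired with a SQLAlchemy engine instead of a raw pyodbc connection; something along these lines should work, with the server, database and driver names being placeholders for your setup:
import pandas as pd
from sqlalchemy import create_engine

# Placeholder connection URL - adjust server, database and ODBC driver to your environment
engine = create_engine(
    "mssql+pyodbc://localhost/Stocks"
    "?driver=ODBC+Driver+17+for+SQL+Server&trusted_connection=yes"
)

combined_df = pd.read_csv("Options_Data_Combined.csv")

# to_sql creates the table if needed (or replaces it) and infers column types from the dataframe
combined_df.to_sql("Options_Data_GME", engine, if_exists="replace", index=False)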
How to query MySQL Record value to Python Variables?
I want to give python variables values that I fetch from a MySQL database.
#!/usr/bin/python -u
# -*- coding: UTF-8 -*-
import time
import datetime
import mysql.connector
import sys

db = mysql.connector.connect(
    host = "localhost",
    user = "admin",
    password = "admin",
    db = "testonly"
)
mycursor = db.cursor()

if __name__ == '__main__':
    temp = 0
    mycursor.execute("SELECT temperature FROM table ORDER BY primarykey DESC LIMIT 1;")
    # By selecting one column in a row, I fetch only one record from the table.
    data = mycursor.fetchone()
    for temperature in data:
        print(temperature)
    temp = data['temperature']
    sys.exit()
Then I have an error like so:
File "test.py", line 28, in <module>
    temp = data['temperature']
TypeError: tuple indices must be integers, not str
In which way can I give a value to a python variable for later usage?
By default, fetchone returns a tuple with the data from your database. As it currently stands, you need to access your data by index:
temp = data[0]
If you want to access your data by the temperature key, you need to specify your cursor:
from mysql.connector.cursor import MySQLCursorDict
...
mycursor = db.cursor(cursor_class=MySQLCursorDict)
...
temp = data['temperature']
Your object data is a tuple and can't be referenced like that. You need to use this:
temp = data[0]
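For completeness (my addition, not part of either answer above), mysql-connector-python can also return dictionary rows directly, which makes key-based access work; a minimal sketch, assuming the same db connection as in the question and a hypothetical readings table:
# dictionary=True makes fetchone() return a dict instead of a tuple
mycursor = db.cursor(dictionary=True)
mycursor.execute("SELECT temperature FROM readings ORDER BY primarykey DESC LIMIT 1")
data = mycursor.fetchone()
temp = data['temperature'] if data else None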
multithreading to load data into sqlite db
I'm downloading data from an API and storing it in a SQLite db. I want to implement the process using "multithreading". Can someone please help me with how to implement it? I found a library but am getting an error. Below is the code.
import sqlite3
import os
import pandas as pd
from sodapy import Socrata
import concurrent.futures

dbPath = 'folder where db exists'
dbName = 'db file name'

## Setup connection & cursor with the DB
dbConn = sqlite3.connect(os.path.join(dbPath, dbName), check_same_thread=False)

## Setup the API and bring in the data
client = Socrata("health.data.ny.gov", None)

## Define all the countys to be used in threading
countys = [all 62 countys in New York]

varDict = dict.fromkeys(countys, {})
strDataList = ['test_date', 'LoadDate']
intDataList = ['new_positives', 'cumulative_number_of_positives', 'total_number_of_tests', 'cumulative_number_of_tests']

def getData(county):
    ## Check if table exists
    print("Processing ", county)
    varDict[county]['dbCurs'] = dbConn.cursor()
    varDict[county]['select'] = varDict[county]['dbCurs'].execute('SELECT name FROM sqlite_master WHERE type="table" AND name=?', (county,))
    if not len(varDict[county]['select'].fetchall()):
        createTable(county)
    whereClause = 'county="'+county+'"'
    varDict[county]['results'] = client.get("xdss-u53e", where=whereClause)
    varDict[county]['data'] = pd.DataFrame.from_records(varDict[county]['results'])
    varDict[county]['data'].drop(['county'], axis=1, inplace=True)
    varDict[county]['data']['LoadDate'] = pd.to_datetime('now')
    varDict[county]['data'][strDataList] = varDict[county]['data'][strDataList].astype(str)
    varDict[county]['data']['test_date'] = varDict[county]['data']['test_date'].apply(lambda x: x[:10])
    varDict[county]['data'][intDataList] = varDict[county]['data'][intDataList].astype(int)
    varDict[county]['data'] = varDict[county]['data'].values.tolist()
    ## Insert values into SQLite
    varDict[county]['sqlQuery'] = 'INSERT INTO ['+county+'] VALUES (?,?,?,?,?,?)'
    varDict[county]['dbCurs'].executemany(varDict[county]['sqlQuery'], varDict[county]['data'])
    dbConn.commit()
    # for i in dbCurs.execute('SELECT * FROM albany'):
    #     print(i)

def createTable(county):
    sqlQuery = 'CREATE TABLE ['+county+'] ( [Test Date] TEXT, [New Positives] INTEGER NOT NULL, [Cumulative Number of Positives] INTEGER NOT NULL, [Total Number of Tests Performed] INTEGER NOT NULL, [Cumulative Number of Tests Performed] INTEGER NOT NULL, [Load date] TEXT NOT NULL, PRIMARY KEY([Test Date]))'
    varDict[county]['dbCurs'].execute(sqlQuery)

# for _ in countys:
#     getData(_)
# x = countys[:5]
with concurrent.futures.ThreadPoolExecutor() as executor:
    # results = [executor.submit(getData, y) for y in x]
    executor.map(getData, countys)
getData is the function which brings in the data county-wise and loads it into the db. countys is a list of all the countys. I am able to do it synchronously but would like to implement multithreading. The for loop to do it synchronously (which works) is
for _ in countys:
    getData(_)
The error message is
ProgrammingError: SQLite objects created in a thread can only be used in that same thread. The object was created in thread id 8016 and this is thread id 19844.
You might find this useful:
sqlite3.connect(":memory:", check_same_thread=False)
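Another option (my addition, not part of the answer above) is to have each worker thread open its own connection instead of sharing a single global one, which is what the error is complaining about; a minimal sketch, with the database path and the body of the worker standing in for the code in the question:
import sqlite3
import concurrent.futures

DB_PATH = "example.db"  # stand-in for os.path.join(dbPath, dbName)

def get_data(county):
    # Each thread opens, uses, and closes its own connection
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (county,))
        # ... fetch from the API and insert this county's rows here ...
        conn.commit()
    finally:
        conn.close()

with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(get_data, ["Albany", "Bronx", "Kings"])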
How to create a new table in a MySQL DB from a pandas dataframe
I recently transitioned from using SQLite for most of my data storage and management needs to MySQL. I think I've finally gotten the correct libraries installed to work with Python 3.6, but now I am having trouble creating a new table from a dataframe in the MySQL database. Here are the libraries I import:
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine
In my code, I first create a dataframe from a CSV file (no issues here):
def csv_to_df(infile):
    return pd.read_csv(infile)
Then I establish a connection to the MySQL database using this function:
def mysql_connection():
    user = 'root'
    password = 'abc'
    host = '127.0.0.1'
    port = '3306'
    database = 'a001_db'
    engine = create_engine("mysql://{0}:{1}@{2}:{3}/{4}?charset=utf8".format(user, password, host, port, database))
    return engine
Lastly, I use the pandas function "to_sql" to create the database table in the MySQL database:
def df_to_mysql(df, db_tbl_name, conn=mysql_connection(), index=False):
    df.to_sql(con = conn, name = db_tbl_name, if_exists='replace', index = False)
I run the code using this line:
df_to_mysql(csv_to_df(r'path/to/file.csv'), 'new_database_table')
This yields the following error:
InvalidRequestError: Could not reflect: requested table(s) not available in Engine(mysql://root:***@127.0.0.1:3306/a001_db?charset=utf8): (new_database_table)
I think this is telling me that I must first create a table in the database before passing the data in the dataframe to this table, but I'm not 100% positive about that. Regardless, I'm looking for a way to create a table in a MySQL database without manually creating the table first (I have many CSVs, each with 50+ fields, that have to be uploaded as new tables in a MySQL database). Any suggestions?
I took an approach suggested by aws_apprentice above, which was to create the table first, then write data to the table. The code below first auto-generates a mysql table from a df (auto defining table names and datatypes), then writes the df data to that table. There were a couple of hiccups I had to overcome, such as unnamed csv columns and determining the correct data type for each field in the mysql table. I'm sure there are multiple other (better?) ways to do this, but this seems to work.
import pandas as pd
from sqlalchemy import create_engine

infile = r'path/to/file.csv'
db = 'a001_db'
db_tbl_name = 'a001_rd004_db004'

'''
Load a csv file into a dataframe; if csv does not have headers, use the headers arg to
create a list of headers; rename unnamed columns to conform to mysql column requirements
'''
def csv_to_df(infile, headers = []):
    if len(headers) == 0:
        df = pd.read_csv(infile)
    else:
        df = pd.read_csv(infile, header = None)
        df.columns = headers
    for r in range(10):
        try:
            df.rename(columns={'Unnamed: {0}'.format(r):'Unnamed{0}'.format(r)}, inplace=True)
        except:
            pass
    return df

'''
Create a mapping of df dtypes to mysql data types (not perfect, but close enough)
'''
def dtype_mapping():
    return {'object' : 'TEXT',
            'int64' : 'INT',
            'float64' : 'FLOAT',
            'datetime64' : 'DATETIME',
            'bool' : 'TINYINT',
            'category' : 'TEXT',
            'timedelta[ns]' : 'TEXT'}

'''
Create a sqlalchemy engine
'''
def mysql_engine(user = 'root', password = 'abc', host = '127.0.0.1', port = '3306', database = 'a001_db'):
    engine = create_engine("mysql://{0}:{1}@{2}:{3}/{4}?charset=utf8".format(user, password, host, port, database))
    return engine

'''
Create a mysql connection from sqlalchemy engine
'''
def mysql_conn(engine):
    conn = engine.raw_connection()
    return conn

'''
Create sql input for table names and types
'''
def gen_tbl_cols_sql(df):
    dmap = dtype_mapping()
    sql = "pi_db_uid INT AUTO_INCREMENT PRIMARY KEY"
    df1 = df.rename(columns = {"" : "nocolname"})
    hdrs = df1.dtypes.index
    hdrs_list = [(hdr, str(df1[hdr].dtype)) for hdr in hdrs]
    for hl in hdrs_list:
        sql += " ,{0} {1}".format(hl[0], dmap[hl[1]])
    return sql

'''
Create a mysql table from a df
'''
def create_mysql_tbl_schema(df, conn, db, tbl_name):
    tbl_cols_sql = gen_tbl_cols_sql(df)
    sql = "USE {0}; CREATE TABLE {1} ({2})".format(db, tbl_name, tbl_cols_sql)
    cur = conn.cursor()
    cur.execute(sql)
    cur.close()
    conn.commit()

'''
Write df data to newly created mysql table
'''
def df_to_mysql(df, engine, tbl_name):
    df.to_sql(tbl_name, engine, if_exists='replace')

df = csv_to_df(infile)
create_mysql_tbl_schema(df, mysql_conn(mysql_engine()), db, db_tbl_name)
df_to_mysql(df, mysql_engine(), db_tbl_name)
This
connection = engine.connect()
df.to_sql(con=connection, name='TBL_NAME', schema='SCHEMA', index=False, if_exists='replace')
works with an Oracle DB in a specific schema without errors, but will not work if you have limited permissions. And note that table names are case sensitive.
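For what it's worth (my addition, not part of the answers above), to_sql() will itself create the table from the dataframe's dtypes when it is given a SQLAlchemy engine, so a minimal version of the original goal can look like this sketch, with the connection URL as a placeholder (it assumes a MySQL driver such as pymysql is installed):
import pandas as pd
from sqlalchemy import create_engine

# Placeholder credentials - adjust to your MySQL setup
engine = create_engine("mysql+pymysql://root:abc@127.0.0.1:3306/a001_db?charset=utf8")

df = pd.read_csv(r'path/to/file.csv')

# to_sql creates the table (inferring column types) if it does not already exist
df.to_sql('new_database_table', engine, if_exists='replace', index=False)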
CSV - MYSQL Using Python
After reading several inputs I still can't get this to work. Most likely I'm doing it all wrong, but I've tried several different approaches. What I'm trying to do is extract data from a CSV and add it into my newly created database/table. My csv input looks like this:
NodeName,NeId,Object,Time,Interval,Direction,NeAlias,NeType,Position,AVG,MAX,MIN,percent_0-5,percent_5-10,percent_10-15,percent_15-20,percent_20-25,percent_25-30,percent_30-35,percent_35-40,percent_40-45,percent_45-50,percent_50-55,percent_55-60,percent_60-65,percent_65-70,percent_70-75,percent_75-80,percent_80-85,percent_85-90,percent_90-95,percent_95-100,IdLogNum,FailureDescription
X13146PAZ,5002,1/11/100,2016-05-16 00:00:00,24,Near End,GE0097-TN01.1,AMM 20PB,-,69684,217287,772,10563,8055,10644,15147,16821,13610,7658,2943,784,152,20,3,0,0,0,0,0,0,0,0,0,-
...
X13146PAZ,5002,1/11/102,2016-05-16 00:00:00,24,Near End,GE0097-TN01.1,AMM 20PB,-,3056,28315,215,86310,90,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-
...
X13146PAZ,5002,1/11/103,2016-05-16 00:00:00,24,Near End,GE0097-TN01.1,AMM 20PB,-,769,7195,11,86400,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-
The mysql table is created, but possibly that might be the issue, as some are varchar columns and some are integer columns. My server is Ubuntu, if that is of any use.
My code:
# -*- coding: utf-8 -*-
# Imports
from datetime import date, timedelta
import sys
import MySQLdb as mdb
import csv
import os

# Vars
Yesterday = date.today() - timedelta(1)

# Opening document
RX_Document = open('./reports/X13146PAZ_TN_WAN_ETH_BAND_RX_' + Yesterday.strftime("%Y%m%d") + "_231500.csv", 'r')
RX_Document_Str = './reports/X13146PAZ_TN_WAN_ETH_BAND_RX_' + Yesterday.strftime("%Y%m%d") + "_231500.csv"
csv_data = csv.reader(file(RX_Document_Str))

con = mdb.connect('localhost', 'username', 'password', 'tn_rx_utilization')

counter = 0
for row in csv_data:
    if counter == 0:
        print row
        continue
    counter = 1
    if counter == 1:
        cur = con.cursor()
        cur.execute('INSERT INTO RX_UTIL(NodeName, NeId, Object, Time, Interval1,Direction,NeAlias,NeType,Position,AVG,MAX,MIN,percent_5-10,percent_10-15,percent_15-20,percent_20-25,percent_25-30,percent_30-35,percent_35-40,percent_40-45,percent_45-50,percent_50-55,percent_55-60,percent_60-65,percent_65-70,percent_70-75,percent_75-80,percent_80-85,percent_85-90,percent_90-95,percent_95-100,IdLogNum,FailureDescription)'
                    'VALUES("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")', tuple(row[:34]))
        con.commit()

# cur.execute("SELECT VERSION()")
# ver = cur.fetchone()
con.commit()
con.close()
You should not put the placeholder %s in quotes ("):
cur.execute('''INSERT INTO RX_UTIL(NodeName, NeId, Object, Time, Interval1, Direction,
    NeAlias, NeType, Position, AVG, MAX, MIN, "percent_5-10", "percent_10-15",
    "percent_15-20", "percent_20-25", "percent_25-30", "percent_30-35",
    "percent_35-40", "percent_40-45", "percent_45-50", "percent_50-55",
    "percent_55-60", "percent_60-65", "percent_65-70", "percent_70-75",
    "percent_75-80", "percent_80-85", "percent_85-90", "percent_90-95",
    "percent_95-100", IdLogNum, FailureDescription)
    VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,
    %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''', tuple(row[:33]))
You are missing percent_0-5 from your INSERT. Remove the quotes from the %s references; these need to be in string format, but the underlying data type will be passed. There may be issues with datatypes resulting from the csv reader. Have Python eval() the csv data to alter the type to an INT. Here is some more information from another post: Read data from csv-file and transform to correct data-type
cur.execute('INSERT INTO RX_UTIL(NodeName, NeId, Object, Time, Interval1,Direction,NeAlias,NeType,Position,AVG,MAX,MIN,percent_0-5,percent_5-10,percent_10-15,percent_15-20,percent_20-25,percent_25-30,percent_30-35,percent_35-40,percent_40-45,percent_45-50,percent_50-55,percent_55-60,percent_60-65,percent_65-70,percent_70-75,percent_75-80,percent_80-85,percent_85-90,percent_90-95,percent_95-100,IdLogNum,FailureDescription)'
            'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', tuple(row[:34]))
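Building on that (my addition, not part of the answers above), the same unquoted placeholders also work with executemany to load the whole file in one call; a rough sketch, assuming the RX_UTIL table has exactly 34 columns in the same order as the CSV and using a placeholder file path:
import csv
import MySQLdb as mdb

con = mdb.connect('localhost', 'username', 'password', 'tn_rx_utilization')
cur = con.cursor()

placeholders = ",".join(["%s"] * 34)  # one unquoted %s per CSV column
sql = "INSERT INTO RX_UTIL VALUES ({0})".format(placeholders)

with open('report.csv', newline='') as f:  # 'report.csv' is a placeholder path
    reader = csv.reader(f)
    next(reader)  # skip the header row
    cur.executemany(sql, [tuple(row[:34]) for row in reader])

con.commit()
con.close()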