SQLite db files not closed: db-shm and db-wal files remain - python

I want to write some basic code to run queries in read-only mode on SQLite databases.
These are daily db files, so it is important not to leave other files on the server, such as the associated db-shm and db-wal files, after closing the connections.
I have been reading the documentation, and it seems that even though I try to close the connections explicitly, they are not closed, so these files stay there.
import sqlite3
import pandas as pd

def _connect(f):
    con = None
    try:
        con = sqlite3.connect("file:" + f + "?mode=ro", uri=True)
    except sqlite3.Error as er:
        print('SQLite error in the db connection: %s' % (' '.join(er.args)))
    return con

def _disconnect(con):
    try:
        con.close()
    except sqlite3.Error as er:
        print('SQLite error in the db disconnection: %s' % (' '.join(er.args)))
    return con
def fl_query(file):
    '''Returns information about a db file

    file : string
        absolute path to the db file

    returns
    -------
    list
    '''
    cn = _connect(file)
    cur = cn.cursor()
    query = """SELECT ....etc"""
    cur.execute(query)
    info = [(row[0], row[1], row[2], row[3], row[4], row[5]) for row in cur.fetchall()]
    cur.close()
    _disconnect(cn)
    return info

folder = 'path to the db file'
file = folder + '6100000_2022-09-18.db'
info = fl_query(file)
I have read about how to cleanly close the databases, but so far nothing works: db-shm and db-wal stay there every time I open a file. Remark: it is a server with thousands of files, so it is important not to create more files.

I realized that I was unable to change the PRAGMA journal mode because I was opening the database in read-only mode. In read-write mode the connection can set the journal mode to DELETE, after which the db-shm and db-wal files are removed when the connection closes.
import sqlite3
import pandas as pd

def _connect(f):
    con = None
    try:
        con = sqlite3.connect("file:" + f + "?mode=rw", uri=True)
    except sqlite3.Error as er:
        print('SQLite error in the db connection: %s' % (' '.join(er.args)))
    return con

def _disconnect(con):
    try:
        con.close()
    except sqlite3.Error as er:
        print('SQLite error in the db disconnection: %s' % (' '.join(er.args)))
    return con

def fl_query(file):
    '''Returns information about a db file

    file : string
        absolute path to the db file

    returns
    -------
    list
    '''
    cn = _connect(file)
    cur = cn.cursor()
    cur.execute("PRAGMA journal_mode=DELETE")
    query = """SELECT ....etc"""
    cur.execute(query)
    info = [(row[0], row[1], row[2], row[3], row[4], row[5]) for row in cur.fetchall()]
    cur.close()
    _disconnect(cn)
    return info
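If the databases must be opened strictly read-only, another option worth knowing is SQLite's immutable=1 URI parameter: it tells SQLite the file cannot change while it is open, so no locking is used and no -shm/-wal files are created. This is only safe when nothing writes to the file during the read, so treat this as a sketch under that assumption:

import sqlite3

def _connect_immutable(f):
    # immutable=1: SQLite assumes the file cannot change, skips locking,
    # and creates no -shm/-wal files; unsafe if another process writes to it.
    return sqlite3.connect("file:" + f + "?immutable=1", uri=True)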

Related

Python Utility To Compare Data Between Sql Server Tables and Similar Snowflake Tables

I'm a newbie to Python and would appreciate your help in building this utility.
Use case: I need to build a Python utility which compares and does basic data validation, like row count and column count, between SQL Server and Snowflake tables. The list of tables needs to be extracted by reading and looping over an Excel file (the SQL Server tables are listed there against the Snowflake tables). The differences are to be written to a separate file.
Code :
# -------------- Import packages needed ----------------------------
import sys, os, pyodbc, datetime, collections
import pandas as pd
import snowflake.connector as sf
import sqlalchemy as sa
#import SNCC_Conn as sfconn
pd.set_option("display.max_rows", 999)
# set params for Snowflake Connection
sncc_auth = 'externalbrowser'
sncc_user = 'xxx'
sncc_warehouse = 'xxx'
sncc_db = 'xxx'
sncc_sch = 'SFSCHEMA'
sncc_tbl = 'TABLE_1'
sncc_qry = 'SELECT COUNT(*) FROM '+sncc_sch+'.'+sncc_tbl+''
#sncc_qry1 = 'SELECT COUNT(COLUMN_NAME) FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME ='+sncc_tbl''
sf_qry = r'' + sncc_qry
# set params for SQL Connection TST .
sql_srvr = 'xxxx'
sql_db = 'xxx'
sql_user = 'xxx'
sql_pwd = 'xxx'
sql_driver = '{ODBC Driver 17 for SQL Server}'
sql_sch = 'SQLSCHEMA'
sql_tbl = 'TABLE_1'
ms_sql_qry = 'SELECT COUNT(*) FROM '+sql_sch+'.' +sql_tbl+''
#ms_sql_qry1 = 'SELECT COUNT(*) FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = 'allegro' and TABLE_NAME = 'trade''
fileName = 'SQL_SF_Comparison'
# --------------------------- Snowflake Connection ---------------------------
try:
    sf_conn = sf.connect(authenticator='externalbrowser',
                         user='xxxx',
                         account='xxx',
                         warehouse='xxx',
                         database='xxx',
                         schema='',
                         role=''
                         )
except Exception as e:
    print('Connection Failed. Please try again.')
    print('Error: ' + str(e))
    quit()
print('Snowflake Connection established!')
print(sf_qry)
try:
    # execute the query through a cursor (the connection object has no execute method)
    sf_cur = sf_conn.cursor()
    sf_cur.execute(sf_qry)
    # Fetch all Snowflake results into a pandas DataFrame
    sf_df = sf_cur.fetch_pandas_all()
    # Make all DataFrame columns uppercase
    sf_df.columns = map(str.upper, sf_df.columns)
    # Print out results on screen during development phase.
    print(sf_df)
    print(sf_df.columns)
    print('Snowflake Dataframe Load Successful.')
except Exception as e:
    print('Snowflake Dataframe load Unsuccessful. Please try again.')
    print('Error: ' + str(e))
# --------------------------- SQL Server Connection ---------------------------
try:
    # one string concatenating the DRIVER, SERVER, DATABASE and credential parts into a single connection string
    sql_conn = pyodbc.connect('DRIVER=' + sql_driver + ';SERVER=tcp:' + sql_srvr + ';PORT=1433;DATABASE=' + sql_db + ';UID=' + sql_user + ';PWD=' + sql_pwd)
    cursor = sql_conn.cursor()
    print('SQL Server Connection established!')
    print(ms_sql_qry)
except Exception as e:
    print('Connection Failed. Please try again.')
    print('Error: ' + str(e))
try:
    # Query results and place them in a variable
    # cursor.execute(sql_qry)
    sql_qry = pd.read_sql_query(ms_sql_qry, sql_conn)
    # Put results into a pandas DataFrame
    sql_df = pd.DataFrame(sql_qry)
    # Make all DataFrame columns uppercase
    sql_df.columns = map(str.upper, sql_df.columns)
    # Print out results during development phase.
    print(sql_df)
    print(sql_df.columns)
    print('SQL Server Dataframe Load Successful')
    print('Comparing SQL to SNCC Dataframes')
    # ********************* COMPARISON SCRIPT **************
    # sql_df.compare(sncc_df)
    # Compare the two DataFrames and produce results from Source (sql_df) that do not match Target (sf_df).
    df_diff = sql_df[sf_df != sql_df]
    # print out results of differences during development phase.
    print(df_diff)
    # Export out to CSV using a variable for the name of the file, future state.
    df_diff.to_csv(r'D:\PythonResults\DataDiff_' + fileName + '.csv', index=False)
    print('Dataframe output from comparison written to the PythonResults folder as DataDiff_' + fileName + '.csv')
except pyodbc.Error as e:
    # Message stating export unsuccessful.
    print("MSSQL Dataframe load unsuccessful.")
finally:
    sf_conn.close()
    print("Connection to Snowflake closed")
    sql_conn.commit()
    sql_conn.close()
    print("Connection to MSSQL Server closed")
File data and file name:
Tables.xlsx
Help me complete the code: extract the list of tables from the Excel file, loop over the rows, load them into dataframes, and compare them.
See pyodbc Connecting to Microsoft Excel for details on reading from Excel.
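Alternatively, pandas can read the workbook directly (via the openpyxl engine). A minimal sketch of the looping piece, assuming hypothetical column headers SQL_TABLE and SF_TABLE in Tables.xlsx (one row per table pair) and reusing the sql_conn, sf_cur and fileName objects from above; adjust the names to your actual workbook:

import pandas as pd

# hypothetical headers; adjust to the actual columns in Tables.xlsx
pairs = pd.read_excel('Tables.xlsx')

results = []
for _, row in pairs.iterrows():
    sql_tbl, sf_tbl = row['SQL_TABLE'], row['SF_TABLE']
    # row count on the SQL Server side
    sql_count = pd.read_sql_query(f'SELECT COUNT(*) AS n FROM {sql_tbl}', sql_conn)['n'][0]
    # row count on the Snowflake side
    sf_cur.execute(f'SELECT COUNT(*) FROM {sf_tbl}')
    sf_count = sf_cur.fetchone()[0]
    results.append({'sql_table': sql_tbl, 'sf_table': sf_tbl,
                    'sql_rows': sql_count, 'sf_rows': sf_count})

# write only the mismatches to a separate file
diff = pd.DataFrame(results)
diff[diff['sql_rows'] != diff['sf_rows']].to_csv('DataDiff_' + fileName + '.csv', index=False)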

Getting row counts from Redshift during unload process and counting rows loaded in S3

My Python code looks like below, where I am unloading data from Redshift to an Amazon S3 bucket. I am trying to get the row count from Redshift and from the S3 bucket to ensure that all the data is loaded. Additionally, I would also like to get the last-uploaded date from the S3 bucket so that I know when the last unload was performed. Kindly suggest the code with an explanation.
Thanks in advance for your time and efforts!
import csv
import redshift_connector
import sys

CSV_FILE = "Tables.csv"
CSV_DELIMITER = ';'
S3_DEST_PATH = "s3://..../"
DB_HOST = "MY HOST"
DB_PORT = 1234
DB_DB = "MYDB"
DB_USER = "MY_READ"
DB_PASSWORD = "MY_PSWD"
IAM_ROLE = "arn:aws:iam::/redshift-role/unload data,arn:aws::iam::/write in bucket"

def get_tables(path):
    tables = []
    with open(path, 'r') as file:
        csv_reader = csv.reader(file, delimiter=CSV_DELIMITER)
        header = next(csv_reader)
        if header is not None:
            for row in csv_reader:
                tables.append(row)
    return tables

def unload(conn, tables, s3_path):
    cur = conn.cursor()
    for table in tables:
        print(f">{table[0]}.{table[1]}")
        try:
            query = f'''unload ('select * from {table[0]}.{table[1]}')
                        to '{s3_path}/{table[1]}/'
                        iam_role '{IAM_ROLE}'
                        CSV
                        PARALLEL FALSE
                        CLEANPATH;'''
            print("loading in progress")
            cur.execute(query)
            print("Done.")
        except Exception as e:
            print("Failed to load")
            print(str(e))
            sys.exit(1)
    cur.close()

def main():
    try:
        conn = redshift_connector.connect(
            host=DB_HOST,
            port=DB_PORT,
            database=DB_DB,
            user=DB_USER,
            password=DB_PASSWORD
        )
        tables = get_tables(CSV_FILE)
        unload(conn, tables, S3_DEST_PATH)
        conn.close()
    except Exception as e:
        print(e)
        sys.exit(1)

if __name__ == '__main__':
    main()
Update: code based on an SO user's comment
tables = ['schema1.tablename', 'schema2.table2']
conn = redshift_connector.connect(
    host='my_host',
    port="my_port",
    database='my_db',
    user="user",
    password='password')
cur = conn.cursor()
cur.execute(f'''select count(*) from {','.join("'" + y + "'" for y in tables)}''')
results = cur.fetchall()
print("The table {} contained".format(tables[0]), *result[0], "rows" + "\n")  # Printing row counts along with table names
cur.close()
conn.close()
2nd Update:
tables = ['schema1.tablename', 'schema2.table2']
conn = redshift_connector.connect(
    host='my_host',
    port="my_port",
    database='my_db',
    user="user",
    password='password')
cur = conn.cursor()
for table in tables:
    cur.execute(f'select count(*) from {table};')
    results = cur.fetchone()
    for row in result:
        print("The table {} contained".format(tables[0]), result[0], "rows" + "\n")  # Printing row counts along with table names
The simple query to get the number of rows is
query = "select count(*) from {table_name}"
For Redshift, all you need to do is
cur.execute(query)
row_count = cur.fetchall()
Using boto3, you can use a similar SQL query to fetch S3 row count as well, as elucidated in this answer.
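For the last-uploaded date, a short boto3 sketch (the bucket and prefix below are placeholders; derive them from your S3_DEST_PATH):

import boto3

s3 = boto3.client('s3')
# hypothetical bucket/prefix; adjust to your unload destination
resp = s3.list_objects_v2(Bucket='my-bucket', Prefix='unload/prefix/')
objects = resp.get('Contents', [])
if objects:
    latest = max(objects, key=lambda o: o['LastModified'])
    print(f"Last unload: {latest['Key']} at {latest['LastModified']}")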
Edit:
Corrected your updated approach a little:
cur = conn.cursor()
for table in tables:
    cur.execute(f'select count(*) from {table};')
    result = cur.fetchone()
    count = result[0] if result else 0
    print(f"The table {table} contained {count} rows.\n")

Open database files (.db) using python

I have a database file (.db) in SQLite3 format and I was attempting to open it to look at the data inside it. Below is my attempt at the code using Python.
import sqlite3
# Create a SQL connection to our SQLite database
con = sqlite3.connect(dbfile)
cur = con.cursor()
# The result of a "cursor.execute" can be iterated over by row
for row in cur.execute("SELECT * FROM "):
    print(row)
# Be sure to close the connection
con.close()
For the line ("SELECT * FROM "), I understand that you have to put the name of the table after the word "FROM"; however, since I can't even open up the file in the first place, I have no idea what name to put. How can I write the code so that I can open up the database file and read its contents?
You analyzed it correctly: after the FROM you have to put the table names. You can find them out like this:
SELECT name FROM sqlite_master WHERE type = 'table'
In code this looks like this:
# loading in modules
import sqlite3
# creating file path
dbfile = '/home/niklas/Desktop/Stuff/StockData-IBM.db'
# Create a SQL connection to our SQLite database
con = sqlite3.connect(dbfile)
# creating cursor
cur = con.cursor()
# reading all table names
table_list = [a for a in cur.execute("SELECT name FROM sqlite_master WHERE type = 'table'")]
# here is your table list
print(table_list)
# Be sure to close the connection
con.close()
That worked very well for me. You have already done the reading of the data correctly; just paste in the table names.
If you want to see the data for visual analysis as a pandas DataFrame, the approach below could also be used.
import pandas as pd
import sqlite3

try:
    conn = sqlite3.connect("file.db")
except Exception as e:
    print(e)
# To read into a pandas DataFrame we need to know the table name
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(f"Table Name : {cursor.fetchall()}")
df = pd.read_sql_query('SELECT * FROM Table_Name', conn)
conn.close()
from flask import Flask
app = Flask(__name__)

from sqlalchemy import create_engine, select, MetaData, Table
from sqlalchemy.sql import and_, or_

engine = create_engine('sqlite://username:password@host/databasename')

class UserModel():
    def __init__(self):
        try:
            self.meta = MetaData()
            self.users = Table("users", self.meta, autoload=True, autoload_with=engine)
        except Exception as e:
            print(e)

    def get(self):
        stmt = select([self.users.c.name, self.users.c.email, self.users.c.password])
        print(stmt)
        result = engine.execute(stmt)
        temp = [dict(r) for r in result] if result else None
        print(temp)
        return temp

How to solve "MySQL server has gone away" with a query

I am getting the below error:
"Failed inserting BLOB data into MySQL table 2006 (HY000): MySQL server has gone away"
import mysql.connector
from django.http import HttpResponse

def insertBLOB(request):
    id = 1
    name = "Eric"
    photo = "/home/gauri/Pictures/w3schools.jpg"
    biodata = "/home/gauri/Downloads/mnum.npy"
    try:
        connection = mysql.connector.connect(host="127.0.0.1", user="root", passwd="",
                                             port=3307, db="data2")
        cursor = connection.cursor()
        sql_insert_blob_query = ("INSERT INTO python_employee(id, name, photo, biodata) "
                                 "VALUES (%s,%s,%s,%s)")
        with open(photo, 'rb') as file:
            binaryData = file.read()
        with open(biodata, 'rb') as file:
            binaryData1 = file.read()
        insert_blob_tuple = (id, name, binaryData, binaryData1)
        result = cursor.execute(sql_insert_blob_query, insert_blob_tuple)
        connection.commit()
        print("Image and file inserted successfully as a BLOB into python_employee table", result)
    except mysql.connector.Error as error:
        print("Failed inserting BLOB data into MySQL table {}".format(error))
    return HttpResponse("data")
Adding the following line to your my.cnf file should solve your problem.
max_allowed_packet=64M
Here is the explanation.
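If editing my.cnf is not an option, the global variable can also be raised at runtime. A sketch, assuming an account with the SUPER (or SYSTEM_VARIABLES_ADMIN) privilege; only connections opened afterwards see the new value:

import mysql.connector

admin = mysql.connector.connect(host="127.0.0.1", user="root", passwd="", port=3307)
cur = admin.cursor()
# 64M in bytes; existing connections must reconnect to pick this up
cur.execute("SET GLOBAL max_allowed_packet = 67108864")
admin.close()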

Why doesn't SQLite accept my INTEGER/TEXT data larger than 8, using Python 3?

Problem
I am trying to read a CSV file into pandas and write it to a SQLite database. The process works for all the columns in the CSV file except for "Fill qty", which is a positive integer (int64). The process changes the column type from TEXT/INTEGER to BLOB.
So I tried to load only the "Fill qty" column from pandas to SQLite, and surprisingly I noticed I can safely do that for all integers smaller than 10 (I don't have 9 in my dataset, so basically 1, 2, ..., 8 loaded successfully).
Here is what I tried:
I tried what I could think of: changing the "Fill_Qty" type in the schema from INTEGER to REAL, NULL or TEXT, and changing the data type in pandas from int64 to float or string before inserting into SQLite. None of them worked. By the look of it, the "Trade_History.csv" file seems fine in pandas or Excel. Is there something my eyes don't see? I am really confused about what is happening here!
You would need the .csv file to test the code. Here are the code and .csv file: https://github.com/Meisam-Heidari/Trading_Min_code
The code:
### Imports:
import pandas as pd
import numpy as np
import sqlite3
from sqlite3 import Error

def create_database(db_file):
    try:
        conn = sqlite3.connect(db_file)
    finally:
        conn.close()

def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by db_file
    :param db_file: database file
    :return: Connection object or None
    """
    try:
        conn = sqlite3.connect(db_file)
        return conn
    except Error:
        return None

def create_table(conn, table_name):
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE {} (Fill_Qty TEXT);'''.format(table_name))
    except Error as e:
        print('Error Code: ', e)
    finally:
        conn.commit()
        conn.close()
    return None

def add_trade(conn, table_name, trade):
    try:
        print(trade)
        sql = '''INSERT INTO {} (Fill_Qty)
                 VALUES(?)'''.format(table_name)
        cur = conn.cursor()
        cur.execute(sql, trade)
    except Error as e:
        print('Error When trying to add this entry: ', trade)
    return cur.lastrowid

def write_to_db(conn, table_name, df):
    for i in range(df.shape[0]):
        trade = (str(df.loc[i, 'Fill qty']))
        add_trade(conn, table_name, trade)
    conn.commit()

def update_db(table_name='My_Trades', db_file='Trading_DB.sqlite', csv_file_path='Trade_History.csv'):
    df_executions = pd.read_csv(csv_file_path)
    create_database(db_file)
    conn = create_connection(db_file)
    table_name = 'My_Trades'
    create_table(conn, table_name)
    # writing to DB
    conn = create_connection(db_file)
    write_to_db(conn, table_name, df_executions)
    # Reading back from DB
    df_executions = pd.read_sql_query("select * from {};".format(table_name), conn)
    conn.close()
    return df_executions

### Main Body:
df_executions = update_db()
Any alternatives?
I am wondering if anyone has had a similar experience. Any advice/solutions to help me load the data into SQLite?
I am trying to have something light and portable, and unless there are no alternatives, I prefer not to go with Postgres or MySQL.
You're not passing a container to .execute() when inserting the data. Reference: https://www.python.org/dev/peps/pep-0249/#id15
What you need to do instead is:
trade = (df.loc[i,'Fill qty'],)
# ^ this comma makes `trade` into a tuple
The types of errors you got would've been:
ValueError: parameters are of unsupported type
Or:
sqlite3.ProgrammingError: Incorrect number of bindings supplied. The current statement uses 1, and there are 2 supplied.
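A minimal self-contained illustration of the binding rule, using an in-memory database; it also explains why single-digit values appeared to work (a one-character string happens to supply exactly one binding):

import sqlite3

conn = sqlite3.connect(':memory:')
cur = conn.cursor()
cur.execute('CREATE TABLE t (Fill_Qty TEXT)')

qty = '12345678'
# Wrong: a bare string is treated as a sequence of 8 one-character parameters,
# so this raises ProgrammingError (uses 1 binding, 8 supplied):
# cur.execute('INSERT INTO t (Fill_Qty) VALUES (?)', qty)
# Right: a one-element tuple supplies exactly one binding:
cur.execute('INSERT INTO t (Fill_Qty) VALUES (?)', (qty,))
conn.commit()
print(cur.execute('SELECT * FROM t').fetchall())  # [('12345678',)]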
