Merge SQL tables into One using Python - python

I'm trying to merge several SQL tables from a DB where I don't have direct access, just an open ODBC connection. All my tables contain the same structure and format. I currently have code that does pretty much the old-school thing, but my DB tables are huge, with millions of records, so it's affecting my memory and storage. Is there a better way to run this program and manage memory more efficiently?
Currently what I do:
# Export each table to its own CSV in chunks (so millions of rows are never
# held in memory at once), then concatenate the CSVs into one output file.
import pandas as pd, shutil, glob, os

# NOTE(review): `conn` must be an open ODBC connection (e.g. pyodbc.connect(...));
# the original called read_sql_query without any connection, which raises.
EXPORT_DIR = r"C:\Temp\exports"   # where the per-table CSVs are written
os.makedirs(EXPORT_DIR, exist_ok=True)

for table in ("table1", "table2", "table3"):
    csv_path = os.path.join(EXPORT_DIR, table + ".csv")
    first = True
    # chunksize streams the result set instead of materialising the table.
    for chunk in pd.read_sql_query("select * from " + table, conn, chunksize=100000):
        chunk.to_csv(csv_path, mode="w" if first else "a", header=first, index=False)
        first = False

#Merging the files
allFiles = sorted(glob.glob(os.path.join(EXPORT_DIR, "*.csv")))
with open('C:\\Desktop\\Outuput_file.csv', 'wb') as outfile:
    for i, fname in enumerate(allFiles):
        with open(fname, 'rb') as infile:
            if i != 0:
                infile.readline()  # skip the duplicated header row
            shutil.copyfileobj(infile, outfile)
        print(fname + " has been imported.")

You can use append mode in pandas to_csv function.
For example:
# First table: write with the header row.
# `conn` is the open ODBC connection — read_sql_query requires it
# (the original omitted it, which raises a TypeError).
table1 = pd.read_sql_query("select * from table1", conn)
table1.to_csv("C:\\Desktop\\Outuput_file.csv", mode='a')
or
# Subsequent tables: append without repeating the header row.
# `conn` is the open ODBC connection — read_sql_query requires it
# (the original omitted it, which raises a TypeError).
table2 = pd.read_sql_query("select * from table2", conn)
table2.to_csv("C:\\Desktop\\Outuput_file.csv", mode='a', header=False)
reference

Related

Handle big files with python & pandas

Thanks for reading my post.
I need to deal with big files, let me give you more context, I extract some tables from a database convert those tables to CSV and after that, I convert them to JSON.
All that is to send the information to BigQuery.
Now my script works fine, but I have a problem: some of the tables I extract are very big — one of them is 14 GB — and my server only has 8 GB of memory. Is there any way to make my script split or append the information?
My script:
# For each table named in D:\Test.txt, stream the rows to a CSV and a JSONL
# file in chunks, so a 14 GB table never has to fit into 8 GB of RAM.
import pyodbc
import fileinput
import csv
import pandas as pd
import json
import os
import sys

conn = pyodbc.connect("Driver={SQL Server};"
                      "Server=TEST;"
                      "username=test;"
                      "password=12345;"
                      "Database=TEST;"
                      "Trusted_Connection=no;")
cursor = conn.cursor()

query = "SELECT * FROM placeholder where "
with open(r"D:\Test.txt") as file:
    lines = file.readlines()
print(lines)

for user_input in lines:
    # WARNING(review): user_input is spliced straight into the SQL text —
    # only acceptable if D:\Test.txt is fully trusted.
    result = query.replace("placeholder", user_input)
    print(result)

    user_inputs = user_input.strip("\n")
    filename = os.path.join('D:\\', user_inputs + '.csv')
    filename_json = os.path.join('D:\\', user_inputs + '.jsonl')
    print(filename)
    print(filename_json)

    first = True
    # chunksize makes read_sql yield DataFrames of 100k rows at a time
    # instead of materialising the whole table in memory.
    for df in pd.read_sql(result, conn, chunksize=100000):
        df.to_csv(filename, index=False, encoding='utf-8', sep='~',
                  quotechar="`", quoting=csv.QUOTE_ALL,
                  mode="w" if first else "a", header=first)
        # Strip stray whitespace from string cells before serialising.
        df_o = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
        chunk_json = df_o.to_json(orient="records", lines=True,
                                  date_format="iso", double_precision=15,
                                  force_ascii=False, date_unit='ms',
                                  default_handler=str)
        with open(filename_json, "w" if first else "a", encoding='utf-8') as jf:
            jf.write(chunk_json)
            if chunk_json and not chunk_json.endswith("\n"):
                jf.write("\n")  # keep one JSON record per line across chunks
        first = False

# The CSVs were only an intermediate step — remove them once done.
dir_name = "D:\\"
test = os.listdir(dir_name)
for item in test:
    if item.endswith(".csv"):
        os.remove(os.path.join(dir_name, item))

cursor.close()
conn.close()
I'm really new to python, I hope you can help me to integrate some into my script.
Really thanks so many guys !!!
Kind regards.
For large data sets you should avoid reading all of it at once and then writing it all at once. You should do partial reads and partial writes.
Since you are using BigQuery you should use partitions to limit the query output. Have some logic to update the partition offsets. For each partition you can generate one file, so your output would be like output-1.csv, output-2.csv, etc.
An example of using a partition:
-- Example: restrict each extract to one date partition per run.
SELECT * FROM placeholder
WHERE transaction_date >= '2016-01-01'
As a bonus tip, avoid doing SELECT *: since BigQuery is a columnar storage system, naming only the columns you actually want to read will significantly improve the performance.

Issue creating CSV from Access database

I have a database in MS Access. I am trying to export one column from one table to a CSV file, with Python using pypyodbc.
From the CSV file obtained, there are no commas in numbers greater than 1. Any idea to solve?
Screen from MS Access:
MS Access database
Screen from the obtained CSV:
CSV
Code:
# Export the LoadValue column of every Access database found in FileProva
# to a same-named CSV in FileOutput.
import pypyodbc
import csv
import os
from pathlib import Path
import re

data_folder1 = Path("/Users/sfulc/Desktop/FileProva/")
data_folder2 = Path("/Users/sfulc/Desktop/FileOutput/")

for filename in os.listdir("/Users/sfulc/Desktop/FileProva/"):
    file1 = r"Dbq=" + os.path.abspath(data_folder1 / filename) + r";"
    # Output path: same name with a .csv extension instead of .mdb.
    file2 = re.sub("mdb", "csv", os.path.abspath(data_folder2 / filename))

    pypyodbc.lowercase = False
    conn = pypyodbc.connect(r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};" + file1)
    try:
        cur = conn.cursor()
        cur.execute("SELECT LoadValue FROM OriginalData")
        with open(file2, 'w', newline='') as f:
            writer = csv.writer(f)
            # Iterate the cursor directly: rows are streamed to the CSV
            # instead of all being loaded at once with fetchall().
            writer.writerows(cur)
        cur.close()
    finally:
        # Close the connection even if the export of this file fails.
        conn.close()

Python - Execute sql statements from list of files in a folder and create output files for each of them

I wrote the following script to read the SQL queries from each file in the folder and then write the output data to a different folder for each file. Do I really need to use pandas here, as I am not doing any data manipulation? If not, how can I execute the scripts and save the output?
# Run every *.sql file in input_path against the database and write each
# result set to a tab-separated .txt of the same name in output_path.
conn = pyodbc.connect('DRIVER={SQL Server};SERVER=' + DB['servername'] + ';DATABASE=' + DB['database'] + ';Trusted_Connection=yes;')
os.chdir(input_path)
for filename in glob.glob('*.sql'):
    # One open() is enough — the original opened the same file twice
    # (the context-managed handle was never read from).
    with open(filename, 'r') as f:
        df = pd.read_sql_query(f.read(), conn)
    output_file_name = filename.rsplit(".", 1)[0] + '.txt'
    output_file = os.path.join(output_path, output_file_name)
    df.to_csv(output_file, sep='\t', index=False)
conn.close()

Why is SQLite3 Querying slower than building a new directory index with pandas

I have a script that needs all files in a given directory to be indexed for easier querying (e.g. find the file with a given filename that has the most recent modified date). I have been walking through the entire directory and adding each file to a dataframe every time the script runs. I would think that instead of gathering this data for every file, it would be quicker to query a database and only update entries whose modified date has changed, but in testing this was way slower. Why is this? Is there a more efficient way to do this?
Result:
Index with pandas took 0:06:53.352515
Innitial index with SQLite3 took 0:43:20.042651
second index with SQLite3 took 0:21:48.863566
"""
{Description}
The purpose of this exercise is to index a directory for later use.
We will try two methods to quantify the benefits of updating a SQLite db
rather than re-indexing into a pandas dataframe each time.
"""
# Built-in/Generic Imports
import os
import sys
# […]
# Libs
import pandas as pd # Or any other
import datetime
import sqlite3
def main():
    """Benchmark pandas re-indexing vs. incremental SQLite indexing of a directory."""
    dbPath = os.path.join(os.getcwd(), 'testing.db')
    # Take the directory to index from the command line: the original
    # referenced the undefined name Random_Directory_with_tons_of_files,
    # which raised a NameError before anything ran.
    if len(sys.argv) < 2:
        sys.exit('usage: python script.py <directory_to_index>')
    indexPath = sys.argv[1]
    setupSQL(dbPath)
    print('Index with pandas took ' + str(testPandas(indexPath)))
    print('Innitial index with SQLite3 took ' + str(testSQLite(indexPath, dbPath)))
    print('second index with SQLite3 took ' + str(testSQLite(indexPath, dbPath)))
def setupSQL(dbPath):
    """Create a fresh benchmark database containing an empty testTable.

    Any existing database at dbPath is deleted first so every run starts
    from a clean slate.
    """
    if os.path.exists(dbPath):
        os.remove(dbPath)
    connection = sqlite3.connect(dbPath)
    try:
        # Connection.execute creates the cursor implicitly.
        connection.execute('''CREATE TABLE testTable
(fullPath, fileName, modifiedDate)''')
        connection.commit()
    finally:
        connection.close()
def testPandas(path):
startTime = datetime.datetime.now()
testIndex = pd.DataFrame(columns=['fullPath', 'fileName', 'modifiedDate'])
for dirpath, dirnames, filenames in os.walk(path):
for filename in filenames:
testIndex = testIndex.append({'fullPath' : os.path.join(dirpath, filename),
'fileName' : filename,
'modifiedDate' : os.path.getmtime(os.path.join(dirpath, filename))}, ignore_index=True)
return datetime.datetime.now() - startTime
def testSQLite(path, dbPath):
startTime = datetime.datetime.now()
conn = sqlite3.connect(dbPath)
c = conn.cursor()
for dirpath, dirnames, filenames in os.walk(path):
for filename in filenames:
c.execute('SELECT * FROM testTable WHERE fullPath=?', [(os.path.join(dirpath, filename))])
row = c.fetchone()
if row == None:
#File is not in database, add it
c.execute('INSERT INTO testTable VALUES (?,?,?)', [
(os.path.join(dirpath, filename)),
(filename),
(os.path.getmtime(os.path.join(dirpath, filename)))
])
conn.commit()
elif row[2] != os.path.getmtime(os.path.join(dirpath, filename)):
#Modified Date has changed, update it.
c.execute('UPDATE testTable SET modifiedDate=? WHERE fullPath=?', [(os.path.getmtime(os.path.join(dirpath, filename))),(os.path.join(dirpath, filename))])
conn.commit()
conn.close()
return datetime.datetime.now() - startTime
# Entry point: run the benchmark only when executed directly, not on import.
if __name__ == '__main__':
    print('Starting')
    main()
    print('Done')
The biggest difference between sqlite and pandas is that sqlite is a file based database. When sqlite connects to a database it only loads certain parts of the file that holds the actual data. Once you query it for something it goes back and reads the file until it finds what it needs then returns that to you. Here is a stack overflow question asking about sqlite and how it loads data.
Sqlite
One thing you could try, to see how much it speeds things up, is to have sqlite create an in-memory database. Instead of passing in a path to a file, just pass in :memory: (lowercase, as in the snippet below) and that should create an in-memory sqlite database. Then retry your benchmark and see if it improves in speed. WARNING: depending on how many files you're grabbing, you may use a lot of memory for this.
conn = sqlite3.connect(':memory:')

How to open and convert sqlite database to pandas dataframe

I have downloaded some datas as a sqlite database (data.db) and I want to open this database in python and then convert it into pandas dataframe.
This is so far I have done
import sqlite3
import pandas

# Opening the database only creates a Connection object — no rows have
# been queried at this point.
dat = sqlite3.connect('data.db') #connected to database with out error
# NOTE(review): from_records expects an iterable of row records, not a
# Connection object — which is exactly why this line raises
# "TypeError: object of type 'sqlite3.Connection' has no len()".
pandas.DataFrame.from_records(dat, index=None, exclude=None, columns=None, coerce_float=False, nrows=None)
But its throwing this error
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 980, in from_records
coerce_float=coerce_float)
File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 5353, in _to_arrays
if not len(data):
TypeError: object of type 'sqlite3.Connection' has no len()
How to convert sqlite database to pandas dataframe
Despite sqlite being part of the Python Standard Library and is a nice and easy interface to SQLite databases, the Pandas tutorial states:
Note In order to use read_sql_table(), you must have the SQLAlchemy
optional dependency installed.
But Pandas still supports sqlite3 access if you want to avoid installing SQLAlchemy:
import sqlite3
import pandas as pd
# Create your connection.
# sqlite3 ships with Python, so unlike read_sql_table this path needs
# no SQLAlchemy install; you must know the table name in advance.
cnx = sqlite3.connect('file.db')
df = pd.read_sql_query("SELECT * FROM table_name", cnx)
As stated here, but you need to know the name of the used table in advance.
The line
data = sqlite3.connect('data.db')
opens a connection to the database. There are no records queried up to this. So you have to execute a query afterward and provide this to the pandas DataFrame constructor.
It should look similar to this
import sqlite3
import pandas as pd

# Open the database and run the query. sqlite3 allows executing
# directly on the connection, which returns a cursor.
dat = sqlite3.connect('data.db')
query = dat.execute("SELECT * From <TABLENAME>")
# The cursor's description metadata supplies the column labels.
cols = [description[0] for description in query.description]
# Build the frame from the fetched row tuples plus those labels.
results = pd.DataFrame.from_records(data=query.fetchall(), columns=cols)
I am not really firm with SQL commands, so you should check the correctness of the query; `<TABLENAME>` should be the name of the table in your database.
Parsing a sqlite .db into a dictionary of dataframes without knowing the table names:
def read_sqlite(dbfile):
    """Load every table of a SQLite file into a dict of DataFrames.

    Returns {table_name: DataFrame} without needing to know the table
    names in advance — they are read from sqlite_master.
    """
    import sqlite3
    from contextlib import closing
    from pandas import read_sql_query

    # closing() is required: sqlite3's own context manager only manages
    # the transaction and leaves the connection OPEN on exit.
    with closing(sqlite3.connect(dbfile)) as dbcon:
        tables = list(read_sql_query(
            "SELECT name FROM sqlite_master WHERE type='table';", dbcon)['name'])
        # Table names come from the database itself, not user input.
        return {tbl: read_sql_query(f"SELECT * from {tbl}", dbcon) for tbl in tables}
Search sqlalchemy, engine and database name in google (sqlite in this case):
import pandas as pd
import sqlalchemy  # third-party; provides the engine read_sql_table requires

db_name = "data.db"
table_name = "LITTLE_BOBBY_TABLES"

# read_sql_table needs a SQLAlchemy engine (a plain sqlite3 connection is
# not enough); sqlite_raw_colnames keeps column names exactly as stored.
engine = sqlalchemy.create_engine("sqlite:///%s" % db_name, execution_options={"sqlite_raw_colnames": True})
df = pd.read_sql_table(table_name, engine)
I wrote a piece of code up that saves tables in a database file such as .sqlite or .db and creates an excel file out of it with each table as a sheet or makes individual tables into csvs.
Note: You don't need to know the table names in advance!
import os, fnmatch
import sqlite3
import pandas as pd
#creates a directory without throwing an error
def create_dir(dir):
    """Ensure *dir* exists, report what happened, and return its path."""
    if os.path.exists(dir):
        print("Directory already existed : ", dir)
    else:
        os.makedirs(dir)
        print("Created Directory : ", dir)
    return dir
#finds files in a directory corresponding to a regex query
def find(pattern, path):
    """Return the full paths of all files under *path* whose names match
    *pattern* (a shell-style glob, e.g. "*.sqlite")."""
    matches = []
    for root, _dirs, files in os.walk(path):
        matches.extend(os.path.join(root, name)
                       for name in files
                       if fnmatch.fnmatch(name, pattern))
    return matches
#convert sqlite databases(.db,.sqlite) to pandas dataframe(excel with each table as a different sheet or individual csv sheets)
def save_db(dbpath=None, excel_path=None, csv_path=None, extension="*.sqlite", csvs=True, excels=True):
    """Export every table of a SQLite database to CSVs and/or one Excel workbook.

    Parameters
    ----------
    dbpath : str, optional
        Path to the database; when None, the first file matching *extension*
        under the current directory is used.
    excel_path, csv_path : str, optional
        Output locations; default to a "Saved_Dataframes_<dbname>" folder
        created next to the database.
    extension : str
        Glob used to auto-discover the database when dbpath is None.
    csvs, excels : bool
        Which output formats to produce (table names need not be known).

    Returns 0 on success, -1 when both output formats are disabled.
    """
    if excels == False and csvs == False:
        print("Atleast one of the parameters need to be true: csvs or excels")
        return -1
    #little code to find files by extension
    if dbpath == None:
        files = find(extension, os.getcwd())
        if len(files) > 1:
            print("Multiple files found! Selecting the first one found!")
            print("To locate your file, set dbpath=<yourpath>")
    dbpath = find(extension, os.getcwd())[0] if dbpath == None else dbpath
    print("Reading database file from location :", dbpath)
    #path handling
    external_folder, base_name = os.path.split(os.path.abspath(dbpath))
    file_name = os.path.splitext(base_name)[0]  #firstname without .
    internal_folder = "Saved_Dataframes_" + file_name
    main_path = os.path.join(external_folder, internal_folder)
    create_dir(main_path)
    excel_path = os.path.join(main_path, "Excel_Multiple_Sheets.xlsx") if excel_path == None else excel_path
    csv_path = main_path if csv_path == None else csv_path

    db = sqlite3.connect(dbpath)
    cursor = db.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    print(len(tables), "Tables found :")

    writer = None
    if excels == True:
        #for writing to excel(xlsx) we will be needing this!
        try:
            import xlsxwriter  # importable module name is lowercase
        except ModuleNotFoundError:
            # The original used IPython's `!pip install`, which is a
            # SyntaxError in plain Python; fail with instructions instead.
            raise ModuleNotFoundError(
                "XlsxWriter is required for excels=True; install it with: pip install XlsxWriter")
        writer = pd.ExcelWriter(excel_path, engine='xlsxwriter')

    # One pass over the tables covers every csvs/excels combination
    # (the original duplicated this loop three times).
    i = 0
    for table_name in tables:
        table_name = table_name[0]
        # Double-quote the identifier so unusual table names can't break the SQL.
        table = pd.read_sql_query('SELECT * from "%s"' % table_name, db)
        i += 1
        if excels == True:
            print("Parsing Excel Sheet ", i, " : ", table_name)
            table.to_excel(writer, sheet_name=table_name, index=False)
        if csvs == True:
            print("Parsing CSV File ", i, " : ", table_name)
            table.to_csv(os.path.join(csv_path, table_name + '.csv'), index_label='index')
    if writer is not None:
        writer.close()  # ExcelWriter.save() was removed in pandas 2.0
    cursor.close()
    db.close()
    return 0
save_db();
If data.db is your SQLite database and table_name is one of its tables, then you can do:
import pandas as pd
# Passing a database URL makes pandas build the SQLAlchemy engine
# internally (SQLAlchemy must still be installed for read_sql_table).
df = pd.read_sql_table('table_name', 'sqlite:///data.db')
No other imports needed.
i have stored my data in database.sqlite table name is Reviews
# Load the whole Reviews table into a DataFrame and display it.
import sqlite3
import pandas as pd  # the original used `pd` without importing it (NameError)

con = sqlite3.connect("database.sqlite")
data = pd.read_sql_query("SELECT * FROM Reviews", con)
print(data)

Categories