How to open and convert sqlite database to pandas dataframe - python

I have downloaded some data as a sqlite database (data.db) and I want to open this database in Python and then convert it into a pandas dataframe.
This is what I have done so far:
import sqlite3
import pandas
dat = sqlite3.connect('data.db') #connected to database with out error
pandas.DataFrame.from_records(dat, index=None, exclude=None, columns=None, coerce_float=False, nrows=None)
But it's throwing this error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 980, in from_records
coerce_float=coerce_float)
File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 5353, in _to_arrays
if not len(data):
TypeError: object of type 'sqlite3.Connection' has no len()
How can I convert the sqlite database to a pandas dataframe?

Although sqlite3 is part of the Python Standard Library and provides a nice and easy interface to SQLite databases, the Pandas tutorial states:
Note In order to use read_sql_table(), you must have the SQLAlchemy
optional dependency installed.
But Pandas still supports sqlite3 access if you want to avoid installing SQLAlchemy:
import sqlite3
import pandas as pd
# Create your connection.
cnx = sqlite3.connect('file.db')
df = pd.read_sql_query("SELECT * FROM table_name", cnx)
As stated here, but you need to know the name of the table in advance.
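If you don't know the table name, a small sketch (assuming a standard SQLite file; file and table names are placeholders) can list the tables from sqlite_master first and then read one of them:
import sqlite3
import pandas as pd

cnx = sqlite3.connect('file.db')

# List the user tables stored in the database file.
tables = pd.read_sql_query(
    "SELECT name FROM sqlite_master WHERE type='table';", cnx)['name'].tolist()
print(tables)

# Read the first table found (pick whichever name you actually need).
if tables:
    df = pd.read_sql_query(f"SELECT * FROM {tables[0]}", cnx)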

The line
dat = sqlite3.connect('data.db')
opens a connection to the database. No records have been queried at this point, so you have to execute a query afterward and pass the result to the pandas DataFrame constructor.
It should look similar to this:
import sqlite3
import pandas as pd
dat = sqlite3.connect('data.db')
query = dat.execute("SELECT * From <TABLENAME>")
cols = [column[0] for column in query.description]
results= pd.DataFrame.from_records(data = query.fetchall(), columns = cols)
I am not very familiar with SQL commands, so you should check the correctness of the query. <TABLENAME> should be the name of the table in your database.

Parsing a sqlite .db into a dictionary of dataframes without knowing the table names:
def read_sqlite(dbfile):
    import sqlite3
    from pandas import read_sql_query, read_sql_table

    with sqlite3.connect(dbfile) as dbcon:
        tables = list(read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", dbcon)['name'])
        out = {tbl: read_sql_query(f"SELECT * from {tbl}", dbcon) for tbl in tables}

    return out
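Usage might look like this (the table name in the comment is just a placeholder):
dfs = read_sqlite('data.db')
print(dfs.keys())        # all table names found in the file
# df = dfs['my_table']   # hypothetical table name; pick one printed above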

Search for sqlalchemy, engine, and the database name on Google (sqlite in this case):
import pandas as pd
import sqlalchemy
db_name = "data.db"
table_name = "LITTLE_BOBBY_TABLES"
engine = sqlalchemy.create_engine("sqlite:///%s" % db_name, execution_options={"sqlite_raw_colnames": True})
df = pd.read_sql_table(table_name, engine)
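If you don't know the table name up front, SQLAlchemy's inspector can list the tables for you; a quick sketch (which table you read is up to you):
import pandas as pd
import sqlalchemy

engine = sqlalchemy.create_engine("sqlite:///data.db")

# Discover the table names before reading.
table_names = sqlalchemy.inspect(engine).get_table_names()
print(table_names)

df = pd.read_sql_table(table_names[0], engine)  # read the first table as an example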

I wrote a piece of code that saves the tables in a database file such as .sqlite or .db and creates an Excel file out of it with each table as a sheet, or writes the individual tables to CSVs.
Note: You don't need to know the table names in advance!
import os, fnmatch
import sqlite3
import pandas as pd


#creates a directory without throwing an error
def create_dir(dir):
    if not os.path.exists(dir):
        os.makedirs(dir)
        print("Created Directory : ", dir)
    else:
        print("Directory already existed : ", dir)
    return dir


#finds files in a directory corresponding to a regex query
def find(pattern, path):
    result = []
    for root, dirs, files in os.walk(path):
        for name in files:
            if fnmatch.fnmatch(name, pattern):
                result.append(os.path.join(root, name))
    return result


#convert sqlite databases (.db, .sqlite) to pandas dataframes (Excel with each table as a different sheet or individual csv sheets)
def save_db(dbpath=None, excel_path=None, csv_path=None, extension="*.sqlite", csvs=True, excels=True):
    if excels == False and csvs == False:
        print("At least one of the parameters needs to be true: csvs or excels")
        return -1

    #little code to find files by extension
    if dbpath == None:
        files = find(extension, os.getcwd())
        if len(files) > 1:
            print("Multiple files found! Selecting the first one found!")
            print("To locate your file, set dbpath=<yourpath>")
    dbpath = find(extension, os.getcwd())[0] if dbpath == None else dbpath
    print("Reading database file from location :", dbpath)

    #path handling
    external_folder, base_name = os.path.split(os.path.abspath(dbpath))
    file_name = os.path.splitext(base_name)[0]  #firstname without .
    exten = os.path.splitext(base_name)[-1]     #.file_extension
    internal_folder = "Saved_Dataframes_" + file_name
    main_path = os.path.join(external_folder, internal_folder)
    create_dir(main_path)
    excel_path = os.path.join(main_path, "Excel_Multiple_Sheets.xlsx") if excel_path == None else excel_path
    csv_path = main_path if csv_path == None else csv_path

    db = sqlite3.connect(dbpath)
    cursor = db.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    print(len(tables), "Tables found :")

    if excels == True:
        #for writing to excel(xlsx) we will be needing this!
        try:
            import xlsxwriter
        except ModuleNotFoundError:
            !pip install XlsxWriter

    if excels == True and csvs == True:
        writer = pd.ExcelWriter(excel_path, engine='xlsxwriter')
        i = 0
        for table_name in tables:
            table_name = table_name[0]
            table = pd.read_sql_query("SELECT * from %s" % table_name, db)
            i += 1
            print("Parsing Excel Sheet ", i, " : ", table_name)
            table.to_excel(writer, sheet_name=table_name, index=False)
            print("Parsing CSV File ", i, " : ", table_name)
            table.to_csv(os.path.join(csv_path, table_name + '.csv'), index_label='index')
        writer.save()
    elif excels == True:
        writer = pd.ExcelWriter(excel_path, engine='xlsxwriter')
        i = 0
        for table_name in tables:
            table_name = table_name[0]
            table = pd.read_sql_query("SELECT * from %s" % table_name, db)
            i += 1
            print("Parsing Excel Sheet ", i, " : ", table_name)
            table.to_excel(writer, sheet_name=table_name, index=False)
        writer.save()
    elif csvs == True:
        i = 0
        for table_name in tables:
            table_name = table_name[0]
            table = pd.read_sql_query("SELECT * from %s" % table_name, db)
            i += 1
            print("Parsing CSV File ", i, " : ", table_name)
            table.to_csv(os.path.join(csv_path, table_name + '.csv'), index_label='index')

    cursor.close()
    db.close()
    return 0


save_db()
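For example, to export only CSVs from a specific .db file, a call might look like this (the path is a placeholder):
# Hypothetical call: point it at a .db file and skip the Excel output.
save_db(dbpath="data.db", extension="*.db", excels=False, csvs=True)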

If data.db is your SQLite database and table_name is one of its tables, then you can do:
import pandas as pd
df = pd.read_sql_table('table_name', 'sqlite:///data.db')
No other imports are needed, though SQLAlchemy must be installed for the connection-string form to work.

I have stored my data in database.sqlite; the table name is Reviews.
import sqlite3
import pandas as pd

con = sqlite3.connect("database.sqlite")
data = pd.read_sql_query("SELECT * FROM Reviews", con)
print(data)

Related

How to read a .db file in Python?

I have an Excel file and want to store it in a .db file. I have done that through sqlite. Now I want to read the .db file through Python, which I am unable to do, as the code I have used says the data is empty.
Below is the code:
import sqlite3
import pandas as pd

df = pd.read_excel('filename.xlsx')
db = 'xyzDB'
conn = sqlite3.connect(db + '.sqlite')
c = conn.cursor()
table_list = [a for a in c.execute("SELECT name FROM sqlite_master WHERE type = 'Sheet1'")]
print(table_list)

#another method
chunksize = 10000
for chunk in pd.read_excel('filename.xlsx', chunksize=chunksize):
    chunk.columns = chunk.columns.str.replace(' ', '_') #replacing spaces
    chunk.to_sql(name='Sheet1', con=conn)
names = list(map(lambda x: x[0], c.description)) #Returns the column names
print(names)
for row in c:
    print(row)
Note: I found these two snippets online and didn't understand the code. I would appreciate it if you could guide me.
Try something like this ...
import pandas as pd
import sqlite3 as sq
# read csv into data frame
df=pd.read_csv('addresses.csv')
sql_data = 'addresses.sqlite'
conn = sq.connect(sql_data)
# write the data frame to the db
df.to_sql('addresses', conn, if_exists='replace', index=False)
conn.commit()
# read back from the database
print(pd.read_sql('select * from addresses', conn))
conn.close()
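As a side note on the original snippet: sqlite_master's type column holds values like 'table' (not a sheet or table name), so listing the tables could look like this sketch, reusing the OP's xyzDB.sqlite file:
import sqlite3

conn = sqlite3.connect('xyzDB.sqlite')
c = conn.cursor()

# Filter on type='table'; the name column then holds the table names.
c.execute("SELECT name FROM sqlite_master WHERE type='table'")
print([row[0] for row in c.fetchall()])
conn.close()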

Why is SQLite3 querying slower than building a new directory index with pandas?

I have a script that needs all files in a given directory to be indexed for easier querying (e.g. find the file with a given filename that has the most recent modified date). I have been walking through the entire directory and adding each file to a dataframe every time the script runs. I would think that instead of gathering this data for every file, it would be quicker to query a database and only update a row if the file's modified date changed, but in testing this was way slower. Why is this? Is there a more efficient way to do this?
Result:
Index with pandas took 0:06:53.352515
Innitial index with SQLite3 took 0:43:20.042651
second index with SQLite3 took 0:21:48.863566
"""
{Description}
The purpose of this exercise is to index a directory for later use.
We will try two methods to quantify the benefits of updating a SQLite db
rather than re-indexing into a pandas dataframe each time.
"""
# Built-in/Generic Imports
import os
import sys
# […]
# Libs
import pandas as pd # Or any other
import datetime
import sqlite3
def main():
    dbPath = os.path.join(os.getcwd(), 'testing.db')
    indexPath = Random_Directory_with_tons_of_files
    setupSQL(dbPath)
    print('Index with pandas took ' + str(testPandas(indexPath)))
    print('Innitial index with SQLite3 took ' + str(testSQLite(indexPath, dbPath)))
    print('second index with SQLite3 took ' + str(testSQLite(indexPath, dbPath)))


def setupSQL(dbPath):
    if os.path.exists(dbPath):
        os.remove(dbPath)
    conn = sqlite3.connect(dbPath)
    c = conn.cursor()
    c.execute('''CREATE TABLE testTable
                 (fullPath, fileName, modifiedDate)''')
    conn.commit()
    conn.close()


def testPandas(path):
    startTime = datetime.datetime.now()
    testIndex = pd.DataFrame(columns=['fullPath', 'fileName', 'modifiedDate'])
    for dirpath, dirnames, filenames in os.walk(path):
        for filename in filenames:
            testIndex = testIndex.append({'fullPath' : os.path.join(dirpath, filename),
                                          'fileName' : filename,
                                          'modifiedDate' : os.path.getmtime(os.path.join(dirpath, filename))}, ignore_index=True)
    return datetime.datetime.now() - startTime


def testSQLite(path, dbPath):
    startTime = datetime.datetime.now()
    conn = sqlite3.connect(dbPath)
    c = conn.cursor()
    for dirpath, dirnames, filenames in os.walk(path):
        for filename in filenames:
            c.execute('SELECT * FROM testTable WHERE fullPath=?', [(os.path.join(dirpath, filename))])
            row = c.fetchone()
            if row == None:
                #File is not in database, add it
                c.execute('INSERT INTO testTable VALUES (?,?,?)', [
                    (os.path.join(dirpath, filename)),
                    (filename),
                    (os.path.getmtime(os.path.join(dirpath, filename)))
                ])
                conn.commit()
            elif row[2] != os.path.getmtime(os.path.join(dirpath, filename)):
                #Modified Date has changed, update it.
                c.execute('UPDATE testTable SET modifiedDate=? WHERE fullPath=?', [(os.path.getmtime(os.path.join(dirpath, filename))), (os.path.join(dirpath, filename))])
                conn.commit()
    conn.close()
    return datetime.datetime.now() - startTime


if __name__ == '__main__':
    print('Starting')
    main()
    print('Done')
The biggest difference between sqlite and pandas is that sqlite is a file-based database. When sqlite connects to a database, it only loads certain parts of the file that hold the actual data. Once you query it for something, it goes back and reads the file until it finds what it needs, then returns that to you. Here is a Stack Overflow question asking about sqlite and how it loads data.
Sqlite
One thing you could try, to see how much it speeds things up, is to have sqlite create an in-memory database. Instead of passing in a path to a file, just pass in :memory: and that should create an in-memory sqlite database. Then retry your benchmark and see if it improves in speed. WARNING: depending on how many files you're grabbing, you may use a lot of memory for this.
conn = sqlite3.connect(':memory:')
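To adapt the benchmark above to this suggestion, note that a ':memory:' database disappears as soon as its connection closes, so setup and test would have to share a single connection. A rough sketch (not the OP's code, just the shape of the change):
import sqlite3

# One shared connection: the in-memory database lives only as long as it does.
conn = sqlite3.connect(':memory:')
c = conn.cursor()
c.execute('CREATE TABLE testTable (fullPath, fileName, modifiedDate)')
conn.commit()

# ... run the same walk / SELECT / INSERT / UPDATE loop against `conn` here ...

conn.close()  # the in-memory database is gone after this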

How to automatically create a table based on a CSV in Postgres using Python

I am a new Python programmer and am trying to import a sample CSV file into my Postgres database using a Python script.
I have a CSV file named abstable1 with 3 headers:
absid, name, number
I have many such files in a folder.
I want to create a table in PostgreSQL with the same name as the CSV file for each of them.
Here is the code I tried, to just create a table for one file as a test:
import psycopg2
import csv
import os
#filePath = 'c:\\Python27\\Scripts\\abstable1.csv'
conn = psycopg2.connect("host= hostnamexx dbname=dbnamexx user= usernamexx password= pwdxx")
print("Connecting to Database")
cur = conn.cursor()
#Uncomment to execute the code below to create a table
cur.execute("""CREATE TABLE abs.abstable1(
absid varchar(10) PRIMARY KEY,
name integer,
number integer
)
""")
#to copy the csv data into created table
with open('abstable1.csv', 'r') as f:
    next(f)
    cur.copy_from(f, 'abs.abstable1', sep=',')
conn.commit()
conn.close()
This is the error that I am getting:
File "c:\Python27\Scripts\testabs.py", line 26, in <module>
cur.copy_from(f, 'abs.abstable1', sep=',')
psycopg2.errors.QueryCanceled: COPY from stdin failed: error in .read() call: exceptions.ValueError Mixing iteration and read methods would lose data
CONTEXT: COPY abstable1, line 1
Any recommendation or alternate solution to resolve this issue is highly appreciated.
Here's what worked for me, using import glob.
This code automatically reads all CSV files in a folder and creates a table with the same name as the file.
Although I'm still trying to figure out how to derive specific datatypes from the data in the CSV, as far as table creation is concerned this works like a charm for all CSV files in a folder.
import csv
import psycopg2
import os
import glob

conn = psycopg2.connect("host= hostnamexx dbname=dbnamexx user= usernamexx password= pwdxx")
print("Connecting to Database")

csvPath = "./TestDataLGA/"

# Loop through each CSV
for filename in glob.glob(csvPath + "*.csv"):
    # Create a table name
    tablename = filename.replace("./TestDataLGA\\", "").replace(".csv", "")
    print(tablename)

    # Open file
    fileInput = open(filename, "r")

    # Extract first line of file
    firstLine = fileInput.readline().strip()

    # Split columns into an array [...]
    columns = firstLine.split(",")

    # Build SQL code to drop table if exists and create table
    sqlQueryCreate = 'DROP TABLE IF EXISTS ' + tablename + ";\n"
    sqlQueryCreate += 'CREATE TABLE ' + tablename + "("

    #some loop or function according to your requirement
    # Define columns for table
    for column in columns:
        sqlQueryCreate += column + " VARCHAR(64),\n"

    sqlQueryCreate = sqlQueryCreate[:-2]
    sqlQueryCreate += ");"

    cur = conn.cursor()
    cur.execute(sqlQueryCreate)
    conn.commit()
    cur.close()
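Regarding the datatype question above: one alternative (not the approach used here) is to let pandas infer the column types and create each table via to_sql with a SQLAlchemy engine. A hedged sketch, with the placeholder credentials and folder from the snippets above:
import glob
import os
import pandas as pd
from sqlalchemy import create_engine

# Placeholder connection string; substitute your own credentials.
engine = create_engine("postgresql+psycopg2://usernamexx:pwdxx@hostnamexx/dbnamexx")

for filename in glob.glob("./TestDataLGA/*.csv"):
    tablename = os.path.splitext(os.path.basename(filename))[0]
    df = pd.read_csv(filename)
    # to_sql creates the table and maps the inferred pandas dtypes to Postgres column types.
    df.to_sql(tablename, engine, if_exists='replace', index=False)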
I tried your code and it works fine:
import psycopg2
conn = psycopg2.connect("host= 127.0.0.1 dbname=testdb user=postgres password=postgres")
print("Connecting to Database")
cur = conn.cursor()
'''cur.execute("""CREATE TABLE abstable1(
absid varchar(10) PRIMARY KEY,
name integer,
number integer
)
""")'''
with open('lolo.csv', 'r') as f:
    next(f)
    cur.copy_from(f, 'abstable1', sep=',', columns=('absid', 'name', 'number'))
conn.commit()
conn.close()
Although I had to make some changes for it to work:
I had to name the table abstable1, because with abs.abstable1 Postgres assumes you are using the schema abs. Maybe you created that schema in your database; if not, check on that. Also, I'm using Python 3.7.
I noticed that you are using Python 2.7 (which I think is no longer supported), and this may cause issues. Since you say you are learning, I would recommend using Python 3: it is much more widely used now, and you will mostly encounter code written for it that you would otherwise have to adapt to Python 2.7.
I post my solution here, based on @Rose's answer.
I used SQLAlchemy, a JSON file as config, and glob.
import json
import glob
from sqlalchemy import create_engine, text


def create_tables_from_files(files_folder, engine, config):
    try:
        for filename in glob.glob(files_folder + "\*csv"):
            tablename = filename.replace(files_folder, "").replace('\\', "").replace(".csv", "")
            input_file = open(filename, "r")
            columns = input_file.readline().strip().split(",")
            create_query = 'DROP TABLE IF EXISTS ' + config["staging_schema"] + "." + tablename + "; \n"
            create_query += 'CREATE TABLE ' + config["staging_schema"] + "." + tablename + " ( "
            for column in columns:
                create_query += column + " VARCHAR, \n "
            create_query = create_query[:-4]
            create_query += ");"
            engine.execute(text(create_query).execution_options(autocommit=True))
            print(tablename + " table created")
    except:
        print("Error at uploading tables")
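Calling it might look like this (the engine URL, folder path, and config file are placeholders; the config only needs a "staging_schema" key, as used above):
# Hypothetical setup matching the function's expectations.
engine = create_engine("postgresql+psycopg2://user:password@localhost/mydb")
with open("config.json") as f:
    config = json.load(f)          # must contain a "staging_schema" key
create_tables_from_files("C:\\data\\csvs", engine, config)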

Extract data from a JSON file in Python 3

How can I get all the data in my image database into the database?
My code:
import re
import json
import sqlite3
connection = sqlite3.connect('example.db')
cursor = connection.cursor()
print ("Opened database successfully");
with open('tem.txt', encoding='utf-8-sig') as json_file:
    data = json.load(json_file)
    for p in data:
        data[p] = re.sub("<[^>]+>", "", str(data[p]))
        print("%s: %s" % (p, data[p]))
I use SQLite:
Use pandas to read the JSON and create a dataframe, then write it into the DB.
I have added sample code.
from sqlalchemy import create_engine
import pandas as pd

df = pd.read_json('path\data.json')
engine = create_engine('sqlite:///my.db', echo=False)
df.to_sql('mytable', con=engine, if_exists='append')
The advantage of this is that with pandas you can make changes in the data very easily.
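To check the result, you could read the table back with pandas through the same engine, a quick sketch reusing the names above:
# Read the data back to verify the write.
check = pd.read_sql('SELECT * FROM mytable', con=engine)
print(check.head())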

Losing column names while reading SQL tables using Python

I am reading very large tables (~3 times my RAM) from SQL Server and writing them as .csv in chunks. But in the .csv files the column names are missing. What am I doing wrong?
My code is as follows:
import pandas as pd
import pyodbc
cnxn = pyodbc.connect('''Driver={SQL Server}; Server=myServer; Database=myDb''')
cursor = cnxn.cursor()
#Extracting the names of all the tables in the database
cursor.execute("select * from information_schema.tables")
tables = cursor.fetchall()
cursor.close()
#Reading all the tables and saving them chunk by chunk as csv
counter = 1
for t in tables:
    print("Currently working on table Number {0} - {1}".format(counter, t[2]))
    cursor = cnxn.cursor()
    cursor.execute("Select * from {0}".format(t[2]))
    file_name = "{0}.csv".format(t[2])
    f = open(file_name, 'w')
    # Get data in batches
    while True:
        # Read the data
        df = pd.DataFrame(cursor.fetchmany(1000))
        # We are done if there are no data
        if len(df) == 0:
            break
        # Let's write to the file
        else:
            df.to_csv(f, header=False)
    counter += 1
    f.close()
    cursor.close()
cnxn.close()
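One way to keep the column names (not shown in the question) is to take them from cursor.description and write the header only for the first chunk of each table. A sketch under the same pyodbc setup, meant to replace the inner while loop:
# Sketch: inside the per-table loop, after cursor.execute(...)
cols = [col[0] for col in cursor.description]   # column names from the driver
first_chunk = True
while True:
    rows = cursor.fetchmany(1000)
    if not rows:
        break
    df = pd.DataFrame.from_records(rows, columns=cols)
    # Write the header only once, then append without it.
    df.to_csv(f, header=first_chunk, index=False)
    first_chunk = False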
