Insert dataframe columns to different tables with sqlAlchemy - python

I got a number of dataframes with information associated with different systems.
Now I'm trying to write the information to multiple tables (one table per system) by using SQLAlchemy.
(I'm pretty new to python and sqlAlchemy tho)
So I'm wondering if there's a nicer way to write the values of each column of the dataframe to DIFFERENT tables?
E.g. Column 1 of dataframe 3, 4 to table 1, column 2 of dataframe 3, 4 to table 2, and so on.
Also I keep getting the integrity error "Duplicate entry" if any values are written twice to the same column in the table.
x = 0
for index in a_id:
table_sim = Table(
f'simulated_for_sys_{np.int_(index)}', meta,
Column('timestamp', DateTime, primary_key = True),
Column('system__id', Integer),
Column('simulated_yield_in_kWh', Float),
Column('global_irradiance_tilted_in_kWh_per_m2', Float)
)
# Checking if table already exist
if not engine.dialect.has_table(engine, f'simulated_for_sys_{np.int_(index)}'):
print("Tables created", table_sim)
# Specified table
meta.create_all(engine)
else:
print("Table already exists...")
conn = engine.connect()
# Write timestamps from 01.01.xxxx till now
for timestamp_utc in timestamp_df['timestamp_utc']:
print(timestamp_utc)
ins = table_sim.insert().values(timestamp = timestamp_utc.to_pydatetime() )
result = conn.execute(ins)
# Write id to table (giving duplicate error..)
for system_id in sys_ids_df['system_id']:
ins1 = table_sim.insert().values(system__id = system_id)
conn = engine.connect()
result = conn.execute(ins1)
# Write pr information from column x of dataframe to table (also duplicate error in between if
# same values appear)
colname = f'pr_{x}'
for colname in pr_daily_df[f'{colname}']:
ins2 = table_sim.insert().values(simulated_yield_in_kWh = colname)
result = conn.execute(ins2)
# Write rad information from column x of dataframe to table (also duplicate error in between if
# same values appear)
colname = f'rad_{x}'
for colname in rad_daily_df[f'{colname}']:
ins3 = table_sim.insert().values(global_irradiance_tilted_in_kWh_per_m2 = colname)
result = conn.execute(ins3)
x += 1

Related

How to perform an SQL update on multiple rows using Python

I am retrieving data from a Postgresdb and storing it in a Pandas dataframe for further processing. While doing that I want to update the queried table and set a flag saying that these rows are getting processed.
engine = engine = create_engine(connection_string, connect_args=credentials)
query = load_query(filename='queries/get_data.sql')
df = pd.read_sql(query, engine)
ids = df['id']
update_query = "update table1" +\
"set status = 'processing'," +\
f"where session_id in ({ids})"
with engine.connect() as con:
rs = con.execute(update_query)
The dataframe then looks like this:
ID
descr
Cell 1
Cell 2
Cell 3
Cell 4
Now I want to update the column "status". What do I need to do? I know I need a list, separated by commas, with each value in quotes... But I wasn't able to build it.
Help appreciated

Unable to write result to Oracle Database from Python using cx_oracle library

I am trying to write results into an Oracle Database using the executemany command. However, I am not getting any error message, nor is the database table getting updated.
Using the same connection object and cursor, I am able to connect to the database, extract data, and insert the data into a new table. However, I am NOT able to update the result in an existing table.
I am using Oracle 19c database , python 3.8.8 and cx_oracle 8.0.0.
Reading the file from Oracle database table 'bench_project_d'. To reproduce the error I have created a csv file and reading the data from csv file.
The data has 7 fields
ROW_ID type: NUMBER(19,0),
GROUP_ID type: NUMBER(19,0),
PLANNED_UNITS type: NUMBER,
IQR_PTU(Calculated on the fly),
Q1_PTU (Calculated on the fly) ,
Q2_PTU (Calculated on the fly) ,
ANOMALY type: NUMBER
All the fields are having data except the new column "ANOMALY".
In this field , all are null values. This is a field where we want to store the results.
While importing the data into Python, we take the first 6 features, calculate the anomaly field, and push the anomaly result to the database.
'''python
#connecting to Database
username="**********"
password="********"
tsn="*********"
conn = cx_Oracle.connect(username, password, tsn)
cur = conn.cursor()
global_df = pd.read_csv("project_data.csv")
#Filtering the group having more than 3 projects
grouped = global_df.groupby("GROUP_ID")
filtered_df = grouped.filter(lambda x: x["GROUP_ID"].count()>3)
filtered_df
#Handling zero Interquartile Range
x = filtered_df[filtered_df['IQR_PTU'] == 0]['GROUP_ID'].unique()
for i in x:
filtered_df.loc[filtered_df['GROUP_ID'] == i,'IQR_PTU'] = (filtered_df[filtered_df['GROUP_ID']==i]['PLANNED_UNITS'].quantile(0.95)) - (filtered_df[filtered_df['GROUP_ID']==i]['PLANNED_UNITS'].quantile(0.05))
#Creating the output 'Anomaly' field
filtered_df['ANOMALY'] =0
filtered_df.loc[filtered_df['PLANNED_UNITS'] > (filtered_df['Q2_PTU']+(1.5*filtered_df['IQR_PTU'])),'ANOMALY']=1
filtered_df.loc[filtered_df['PLANNED_UNITS'] < (filtered_df['Q1_PTU']-(1.5*filtered_df['IQR_PTU'])),'ANOMALY']=-1
#Formating the Dataframe
result_df = df.loc[:,['ROW_ID','GROUP_ID', 'ANOMALY']]
result_df = result_df.round(2)
result_df=result_df.astype('object')
value_to_stre_db=result_df.values.tolist()
#Pushing the result to Database table bench_project_d
statement = 'update bench_project_d set GROUP_ID = :2, ANOMALY= :3 where ROW_ID = :1'
cur.executemany(statement, value_to_stre_db)
conn.commit()
EDIT 1:
I have tried to convert list of array to list of tuples and executed the same code again. But still no luck.
rows = [tuple(x) for x in value_to_stre_db]
#Pushing the result to Database table bench_project_d
statement = ''update bench_project_d set GROUP_ID = :2, ANOMALY= :3 where ROW_ID = :1''
cur.executemany(statement, rows)
conn.commit()

Obtain list of IDs inserted from pandas to_sql function

The following Python code successfully appends the rows belonging to the pandas dataframe into an MS SQL table via the SqlAlchemy engine previously configured.
df.to_sql(schema='stg', name = 'TEST', con=engine, if_exists='append', index=False)
I want to obtain the auto-generated IDs numbers for each of the rows inserted into the stg.Test table. In other words, what is the SqlAlchemy equivalent to the Sql Server OUTPUT clause during an INSERT statement
Unfortunately, there is no easy solution to your problem like an additional parameter in your statement. You have to use the behavior that new rows get the highest id + 1 assigned. With this knowledge, you can calculate the ids of all your rows.
Option 1: Explained in this answer. You select the current maximum id, before the insert statement. Then, you assign ids to all the entries in your DataFrame greater than the previous maximum. Lastly, insert the df which already includes the ids.
Option 2: You insert the DataFrame and then acquire the highest id. With the number of entries inserted you can calculate the id of all entries. This is what such an insert function could look like:
def insert_df_and_return_ids(df, engine, table_name='students', id_column='id'):
    """Append ``df`` to a table and return the ids given to the new rows.

    The ids are not read back from the database row by row. They are
    derived from the highest id present after the insert, under the
    assumption that the new rows received consecutive ids ending at that
    maximum (the usual behaviour of an auto-increment key).

    Args:
        df: DataFrame whose rows are appended to the table.
        engine: SQLAlchemy engine used for both the insert and the lookup.
        table_name: Target table. Defaults to ``'students'``.
        id_column: Name of the auto-generated id column. Defaults to ``'id'``.

    Returns:
        A list of ``len(df)`` consecutive integers ending at the current
        maximum id; an empty list when ``df`` has no rows.

    Raises:
        ValueError: If rows were inserted but the maximum id is NULL.

    Note:
        ``table_name`` and ``id_column`` are interpolated into the SQL text
        because identifiers cannot be bound as parameters. Pass only
        trusted, developer-chosen names.
    """
    # Local import keeps this snippet self-contained.
    from sqlalchemy import text

    entries = df.shape[0]

    # It is important to use the same connection for both statements if
    # something like last_insert_rowid() is used. engine.begin() also wraps
    # both statements in one transaction, commits on success, rolls back on
    # error and always releases the connection.
    with engine.begin() as conn:
        # Insert the df into the database
        df.to_sql(table_name, conn, if_exists='append', index=False)
        # Acquire the maximum id (plain SQL, should work for all SQL variants).
        # Alternatives: 'Select last_insert_rowid()' for SQLite,
        # 'Select last_insert_id()' for MySQL.
        last_id = conn.execute(
            text(f'SELECT max({id_column}) FROM {table_name}')
        ).scalar()

    if entries == 0:
        # Nothing was inserted, so there are no new ids to report.
        return []
    if last_id is None:
        raise ValueError(
            f'max({id_column}) is NULL after inserting {entries} rows into '
            f'{table_name}'
        )

    last_id = int(last_id)
    # Generate list of ids
    return list(range(last_id - entries + 1, last_id + 1))
PS: I could not test the function on an MS SQL server, but the behavior should be the same. In order to test if everything behaves as it should you can use this:
import numpy as np
import pandas as pd
import sqlalchemy as sa
# Demo script: exercises insert_df_and_return_ids against a local SQLite file.
# Change connection to MS SQL server
engine = sa.create_engine('sqlite:///test.lite', echo=False)

# Create table
meta = sa.MetaData()
students = sa.Table(
    'students', meta,
    sa.Column('id', sa.Integer, primary_key=True),
    sa.Column('name', sa.String),
)
meta.create_all(engine)

# DataFrame to insert with two entries
df = pd.DataFrame({'name': ['Alice', 'Bob']})
ids = insert_df_and_return_ids(df, engine)
print(ids)  # [1, 2] when test.lite starts out empty

# Insert any entry with a high id in order to check if new ids are always the maximum.
# engine.begin() commits on success and closes the connection even on error;
# sa.text() is used because newer SQLAlchemy versions reject plain SQL strings.
with engine.begin() as conn:
    result = conn.execute(
        sa.text("Insert into students (id, name) VALUES (53, 'Charlie')")
    )

# Insert data frame again
ids = insert_df_and_return_ids(df, engine)
print(ids)  # [54, 55]
EDIT: If multiple threads are utilized, transactions can be used to make the option thread-safe at least for SQLite:
# Run the insert and the max(id) lookup inside one explicit transaction.
# The context managers commit on success, roll back if either statement
# raises, and always return the connection to the pool.
with engine.connect() as conn:
    with conn.begin():
        df.to_sql('students', conn, if_exists='append', index=False)
        result = conn.execute(sa.text('SELECT max(id) FROM students'))
        # Read the value while the transaction and connection are still open.
        last_id = result.scalar()

Populating Table with Values from other table if ID not in DWH table

I am performing an ETL task where I am querying tables in a Data Warehouse to see if it contains IDs in a DataFrame (df) which was created by joining tables from the operational database.
The DataFrame only has ID columns from each joined table in the operational database. I have created a variable for each of these columns, e.g. 'billing_profiles_id' as below:
billing_profiles_dim_id = df['billing_profiles_dim_id']
I am attempting to iterate row by row to see if the ID here is in the 'billing_profiles_dim' table of the Data Warehouse. Where the ID is not present, I want to populate the DWH tables row by row using the matching ID rows in the ODB:
for key in billing_profiles_dim_id:
sql = "SELECT * FROM billing_profiles_dim WHERE id = '"+str(key)+"'"
dwh_cursor.execute(sql)
result = dwh_cursor.fetchone()
if result == None:
sqlQuery = "SELECT * from billing_profile where id = '"+str(key)+"'"
sqlInsert = "INSERT INTO billing_profile_dim VALUES ('"+str(key)+"','"+billing_profile.name"')
op_cursor = op_connector.execute(sqlInsert)
billing_profile = op_cursor.fetchone()
So far at least, I am receiving the following error:
SyntaxError: EOL while scanning string literal
This error message points at the closing bracket at
sqlInsert = "INSERT INTO billing_profile_dim VALUES ('"+str(key)+"','"+billing_profile.name"')
Which I am currently unable to solve. I'm also aware that this code may run into another problem or two. Could someone please see how I can solve the current issue and please ensure that I head down the correct path?
You are missing a double tick and a +
sqlInsert = "INSERT INTO billing_profile_dim VALUES ('"+str(key)+"','"+billing_profile.name+"')"
But you should really switch to prepared statements like
# Let the driver bind the value: the %s placeholder must NOT be wrapped in
# SQL quotes, otherwise the driver's own quoting is applied on top of them.
# NOTE(review): %s assumes a DB-API driver with the "format"/"pyformat"
# paramstyle (e.g. MySQL or PostgreSQL drivers) - confirm for the driver in use.
sql = "SELECT * FROM billing_profiles_dim WHERE id = %s"
dwh_cursor.execute(sql, (str(key),))
...
sqlInsert = ('INSERT INTO billing_profile_dim VALUES '
             '(%s, %s )')
dwh_cursor.execute(sqlInsert, (str(key), billing_profile.name))

Use temp table with SQLAlchemy

I am trying to use a temp table with SQLAlchemy and join it against an existing table. This is what I have so far
engine = db.get_engine(db.app, 'MY_DATABASE')
df = pd.DataFrame({"id": [1, 2, 3], "value": [100, 200, 300], "date": [date.today(), date.today(), date.today()]})
temp_table = db.Table('#temp_table',
db.Column('id', db.Integer),
db.Column('value', db.Integer),
db.Column('date', db.DateTime))
temp_table.create(engine)
df.to_sql(name='tempdb.dbo.#temp_table',
con=engine,
if_exists='append',
index=False)
query = db.session.query(ExistingTable.id).join(temp_table, temp_table.c.id == ExistingTable.id)
out_df = pd.read_sql(query.statement, engine)
temp_table.drop(engine)
return out_df.to_dict('records')
This doesn't return any results because the insert statements that to_sql does don't get run (I think this is because they are run using sp_prepexec, but I'm not entirely sure about that).
I then tried just writing out the SQL statement (CREATE TABLE #temp_table..., INSERT INTO #temp_table..., SELECT [id] FROM...) and then running pd.read_sql(query, engine). I get the error message
This result object does not return rows. It has been closed automatically.
I guess this is because the statement does more than just SELECT?
How can I fix this issue (either solution would work, although the first would be preferable as it avoids hard-coded SQL). To be clear, I can't modify the schema in the existing database—it's a vendor database.
In case the number of records to be inserted in the temporary table is small/moderate, one possibility would be to use a literal subquery or a values CTE instead of creating temporary table.
# MODEL
class ExistingTable(Base):
    """Declarative model for the pre-existing ``existing_table`` table.

    Only the columns needed by the example are declared here; the real
    table may have more.
    """

    __tablename__ = 'existing_table'

    id = sa.Column(sa.Integer, primary_key=True)
    name = sa.Column(sa.String)
    # ...
Assume also following data is to be inserted into temp table:
# This data retrieved from another database and used for filtering
rows = [
(1, 100, datetime.date(2017, 1, 1)),
(3, 300, datetime.date(2017, 3, 1)),
(5, 500, datetime.date(2017, 5, 1)),
]
Create a CTE or a sub-query containing that data:
# Build one SELECT of literal values per input row; unioned together they
# take the place of the temp table.
stmts = [
    # NOTE: optimization to reduce the size of the statement:
    # make type cast only for first row, for other rows DB engine will infer
    # NOTE(review): passing a list to select() is the older calling style -
    # confirm it is still accepted by the installed SQLAlchemy version.
    sa.select([
        sa.cast(sa.literal(i), sa.Integer).label("id"),
        sa.cast(sa.literal(v), sa.Integer).label("value"),
        sa.cast(sa.literal(d), sa.DateTime).label("date"),
    ]) if idx == 0 else
    sa.select([sa.literal(i), sa.literal(v), sa.literal(d)]) # no type cast
    for idx, (i, v, d) in enumerate(rows)
]
# Combine the per-row SELECTs into a single UNION ALL statement.
subquery = sa.union_all(*stmts)
# Choose one option below.
# I personally prefer B because one could reuse the CTE multiple times in the same query
# subquery = subquery.alias("temp_table") # option A
subquery = subquery.cte(name="temp_table") # option B
Create final query with the required joins and filters:
query = (
session
.query(ExistingTable.id)
.join(subquery, subquery.c.id == ExistingTable.id)
# .filter(subquery.c.date >= XXX_DATE)
)
# TEMP: Test result output
for res in query:
print(res)
Finally, get pandas data frame:
out_df = pd.read_sql(query.statement, engine)
result = out_df.to_dict('records')
You can try to use another solution - Process-Keyed Table
A process-keyed table is simply a permanent table that serves as a
temp table. To permit processes to use the table simultaneously, the
table has an extra column to identify the process. The simplest way to
do this is the global variable @@spid (@@spid is the process id in SQL
Server).
...
One alternative for the process-key is to use a GUID (data type
uniqueidentifier).
http://www.sommarskog.se/share_data.html#prockeyed

Categories