I am trying to update a MySQL database table.
I started by creating an ORM-style object that helps me reduce the volume of an update query by using UPDATE ... WHERE conditions.
First of all, I created a variable holding data filtered out of a DataFrame, using a condition on another DataFrame read from a CSV file with pd.read_csv.
This is my simple rule, so it is easy to create conditions like this:
myOutlook_inBox = pd.read_csv(r'' + mydir + 'test.CSV',
                              usecols=['Subject', 'Body', 'From: (Name)', 'To: (Name)'],
                              encoding='latin-1')
This extracts the site-code data from the Subject column of the CSV (the chained call needs enclosing parentheses to span lines):

replaced_sbj_value = (myOutlook_inBox['Subject']
                      .str.extract(pat=r'(L(?:DEL|CAI|SIN).\d{5})')
                      .dropna())
myOutlook_inBox["Subject"] = replaced_sbj_value
And this is the condition I use to filter for the specific data:

frm_mwfy_to_te = myOutlook_inBox.loc[
    myOutlook_inBox['From: (Name)'].str.contains("mowafy", na=False)
    & myOutlook_inBox['To: (Name)'].str.contains("te", na=False)
].drop_duplicates(keep=False)
frm_mwfy_to_te.Subject
And this variable holds the rows of the MySQL table whose site_code contains one of the extracted Subject values:

filtered_data = all_data.loc[
    all_data.site_code.str.contains('|'.join(frm_mwfy_to_te.Subject))]
And this is my SQL query. All I need now is a query that filters on the column called site_code, finds the rows whose values match filtered_data, and updates (replaces) the value in the column called pending with the value TE:

update_db_query = engine.execute("UPDATE govtracker SET pending = 'TE' "
                                 "WHERE site_code = " + filtered_data)
I think I am on the wrong track here; any ideas how to solve this?
Note: I don't need to reference the old value in my query; I just want to update the value in the same row, according to the filtered DataFrame, with the new value I mentioned in the query.
For example, according to frm_mwfy_to_te.Subject (Subject is a column name in the CSV file), let's say the output of frm_mwfy_to_te.Subject is:
Subject
LCAIN20804
LDELE30434
LSINI20260
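For those three values, the statement I am trying to produce would be equivalent to this hand-written query (shown here only to make the intent concrete):

UPDATE govtracker
SET pending = 'TE'
WHERE site_code IN ('LCAIN20804', 'LDELE30434', 'LSINI20260');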
And this is my whole code:
from sqlalchemy import types, create_engine
import pandas as pd
import os
import csv
import MySQLdb
# MySQL Connection
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'Mharooney'
MYSQL_HOST_IP = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_DATABASE = 'mydb'
engine = create_engine('mysql+mysqlconnector://' + MYSQL_USER + ':' + MYSQL_PASSWORD
                       + '@' + MYSQL_HOST_IP + ':' + str(MYSQL_PORT) + '/' + MYSQL_DATABASE,
                       echo=False)
# engine = create_engine('mysql+mysqldb://root:@localhost:123456/myDB?charset=utf8mb4&binary_prefix=true', echo=False)
mydir = (os.getcwd()).replace('\\', '/') + '/'
all_data = pd.read_sql('SELECT * FROM govtracker', engine)
# .drop(['#'], axis=1)
myOutlook_inBox = pd.read_csv(r'' + mydir + 'test.CSV',
                              usecols=['Subject', 'Body', 'From: (Name)', 'To: (Name)'],
                              encoding='latin-1')
myOutlook_inBox.columns = myOutlook_inBox.columns.str.replace(' ', '')
# extract the site code (prefix letters + 5 digits, e.g. LCAIN20804) from the Subject column
replaced_sbj_value = myOutlook_inBox['Subject'].str.extract(pat=r'(L(?:DEL|CAI|SIN).\d{5})').dropna()
# these are the values I want to filter on in the database
myOutlook_inBox["Subject"] = replaced_sbj_value
# these conditions filter and de-duplicate repeated data from the Outlook exported file
# Condition 1: any mail from mowafy to te
frm_mwfy_to_te = myOutlook_inBox.loc[
    myOutlook_inBox['From:(Name)'].str.contains("mowafy", na=False)
    & myOutlook_inBox['To:(Name)'].str.contains("te", na=False)
].drop_duplicates(keep=False)
frm_mwfy_to_te.Subject
filtered_data = all_data.loc[
    all_data.site_code.str.contains('|'.join(frm_mwfy_to_te.Subject))]
print(myOutlook_inBox)
all_data = all_data.replace('\n', '', regex=True)  # assign back, otherwise the result is discarded
df = all_data.where((pd.notnull(all_data)), None)
print(df)
print("Success")
print(frm_mwfy_to_te.Subject)
print(filtered_data)
# rows = engine.execute("SELECT * FROM govtracker")#.fetchall()
# print(rows)
update_db_query = engine.execute("UPDATE govtracker SET pending = 'TE'
WHERE site_code = " + filtered_data)
"""engine = create_engine('postgresql+psycopg2://user:pswd#mydb')
df.to_sql('temp_table', engine, if_exists='replace')"""
# select_db_query = pd.read_sql("SELECT * FROM govtracker", con = engine)
#print(update_db_query)
Now let's say this is the output of my ORM. I will use it to filter for the rows with these three values in the MySQL database, so as to update every row containing these values; I want to update the columns called pending and pending_status in MySQL.
And this is the DDL of my table:
CREATE TABLE `mydb`.`govtracker` (
`id` INT,
`site_name` VARCHAR(255),
`region` VARCHAR(255),
`site_type` VARCHAR(255),
`site_code` VARCHAR(255),
`tac_name` VARCHAR(255),
`dt_readiness` DATE,
`rfs` VARCHAR(255),
`rfs_date` DATE,
`huawei_1st_submission_date` DATE,
`te_1st_submission_date` DATE,
`huawei_2nd_submission_date` DATE,
`te_2nd_submission_date` DATE,
`huawei_3rd_submission_date` DATE,
`te_3rd_submission_date` DATE,
`acceptance_date_opt` DATE,
`acceptance_date_plan` DATE,
`signed_sites` VARCHAR(255),
`as_built_date` DATE,
`as_built_status` VARCHAR(255),
`date_dt` DATE,
`dt_status` VARCHAR(255),
`shr_status` VARCHAR(255),
`dt_planned` INT(255),
`integeration_status` VARCHAR(255),
`comments_snags` LONGTEXT,
`cluster_name` LONGTEXT,
`type_standalone_colocated` VARCHAR(255),
`installed_type_standalone_colocated` VARCHAR(255),
`status` VARCHAR(255),
`pending` VARCHAR(255),
`pending_status` LONGTEXT,
`problematic_details` LONGTEXT,
`ets_tac` INT(255),
`region_r` VARCHAR(255),
`sf6_signed_date` DATE,
`sf6_signed_comment` LONGTEXT,
`comment_history` LONGTEXT,
`on_air_owner` VARCHAR(255),
`pp_owner` VARCHAR(255),
`report_comment` LONGTEXT,
`hu_opt_area_owner` VARCHAR(255),
`planning_owner` VARCHAR(255),
`po_number` VARCHAR(255),
`trigger_date` DATE,
`as_built_status_tr` VARCHAR(255)
) ENGINE = InnoDB;
Another important note: in Excel, when I use a filter on a column, it shows all the values in that column. Let's say pending is the column I've selected; it has the values Accepted, PAC in progress, Planning, TE, PP, DT, FM, Rollout, Integration, Opt Team.
All the remaining columns have value sets like this too.
So should I create a table, something like columns_values, and fill it with all these values, since they are static? Would that make my case easier to solve?
Last note: this database comes from an existing XLSM file, but I pushed the data from the XLSM into MySQL, and MySQL is now my main database, not the Excel files. However, I am updating the MySQL database through a CSV file, not from within the database itself: the frm_mwfy_to_te.Subject object is data extracted from the DataFrame built from that CSV file.
Any ideas here? I hope everything is clear enough.
Could this material help me or not?
https://auth0.com/blog/sqlalchemy-orm-tutorial-for-python-developers/#SQLAlchemy-ORM
Important note: filtered_data is actually a pandas DataFrame, but for one column only from the CSV file, because I want to filter the database with this column's values, as I posted before, to update some columns in my database. I started with updating one column, pending, so as to see the result; after that I'll update the other columns. The script I want to create should search my database for the values in filtered_data. For example, take the value LCAIN20804: I want to filter the database table on this value, then go to the column called huawei_1st_submission_date; if it isn't filled, fill it with the current date; if it is filled, go to the pending column and replace the old value with TE, then go to pending_status and replace the old value with waiting TE acceptance, and so on. That's a small part of the script I want to create.
I hope this is clear enough
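A minimal sketch of that per-site-code logic (the column names follow the CREATE TABLE above; engine and frm_mwfy_to_te come from my code; the exact rule texts, like 'Waiting TE acceptance', are just how I described them and may need adjusting):

from datetime import date

conn = engine.connect()
for site in frm_mwfy_to_te.Subject:
    row = conn.execute(
        "SELECT huawei_1st_submission_date FROM govtracker WHERE site_code = %s",
        (site,)).fetchone()
    if row is None:
        continue  # site code not present in govtracker
    if row[0] is None:
        # first submission date is empty: fill it with the current date
        conn.execute(
            "UPDATE govtracker SET huawei_1st_submission_date = %s WHERE site_code = %s",
            (date.today(), site))
    else:
        # already filled: update pending / pending_status instead
        conn.execute(
            "UPDATE govtracker SET pending = 'TE', pending_status = 'Waiting TE acceptance' "
            "WHERE site_code = %s",
            (site,))
conn.close()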
If you want to turn a pandas DataFrame into a SQL update statement, it may be nice to first transform it into a list of tuples, where the tuples are the new column values, and then pass that list to connection.execute, which runs the statement once per tuple (https://stackoverflow.com/a/27743541/5015356):
values = [tuple(x) for x in filtered_data.values]
query = """
UPDATE govtracker
SET pending = 'TE'
WHERE site_code = '%s')
"""
connection = engine.connect()
update_db_query = connection.execute(query, values)
For each tuple (<sitecode>), this will execute the update statement. If you want to update more columns or expand the where clause, just add the additional columns to filtered_data, and add a new %s where you want the other value to appear.
Just make sure you keep the columns in the correct order!
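For this particular question, where only site_code drives the WHERE clause, filtered_data should first be reduced to that single column before building the tuples, e.g. (a sketch using the question's variables):

values = [(code,) for code in filtered_data.site_code]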
Related
I created a variable that stores patient ID and a count of the number of missed appointments per patient. I created a table with SQLite and I am trying to store my variable into my created table but I am getting an error of "ValueError: parameters are of unsupported type". Here is my code so far:
import pandas as pd
import sqlite3
conn = sqlite3.connect('STORE')
c = conn.cursor()
c.execute("DROP TABLE IF EXISTS PatientNoShow")
c.execute("""CREATE TABLE IF NOT EXISTS PatientNoShow ("PatientId" text, "No-show" text)""")
df = pd.read_csv(r"C:\missedappointments.csv")
df2 = df[df['No-show']=="Yes"]
pt_counts = df2["PatientId"].value_counts()
c.executemany("INSERT OR IGNORE INTO PatientNoShow VALUES (?, ?)", pt_counts)
Thank you in advance for any help! Still learning, so any kind of "explain to me like I'm 5" answers will be appreciated! Also, once I create my tables and store info in them, how would I print or get a visual of the output?
You wrote that the two variables are of type text in
c.execute("""CREATE TABLE IF NOT EXISTS PatientNoShow ("PatientId" text, "No-show" text)""")
but pt_counts contains integers, because it counts the values in the PatientId column. Besides, .executemany() needs a sequence of parameter tuples to work properly.
This piece of code should work if PatientId is of string type:
import pandas as pd
import sqlite3
conn = sqlite3.connect('STORE')
c = conn.cursor()
c.execute("DROP TABLE IF EXISTS PatientNoShow")
c.execute("""CREATE TABLE IF NOT EXISTS PatientNoShow ("PatientId" text, "No-show" integer)""") # type changed
df = pd.read_csv(r"C:/Users/bob/Desktop/Trasporti_project/Matchings_locations/norm_data/standard_locations.csv")
pt_counts = df["standard_name"].value_counts()
c.executemany("INSERT OR IGNORE INTO PatientNoShow VALUES (?, ?)", pt_counts.iteritems()) # this is a sequence
I am performing an ETL task where I am querying tables in a Data Warehouse to see if it contains IDs in a DataFrame (df) which was created by joining tables from the operational database.
The DataFrame only has ID columns from each joined table in the operational database. I have created a variable for each of these columns, e.g. 'billing_profiles_id' as below:
billing_profiles_dim_id = df['billing_profiles_dim_id']
I am attempting to iterate row by row to see if the ID is in the billing_profiles_dim table of the Data Warehouse. Where the ID is not present, I want to populate the DWH tables row by row using the matching ID rows in the ODB:
for key in billing_profiles_dim_id:
    sql = "SELECT * FROM billing_profiles_dim WHERE id = '"+str(key)+"'"
    dwh_cursor.execute(sql)
    result = dwh_cursor.fetchone()
    if result == None:
        sqlQuery = "SELECT * from billing_profile where id = '"+str(key)+"'"
        sqlInsert = "INSERT INTO billing_profile_dim VALUES ('"+str(key)+"','"+billing_profile.name"')
        op_cursor = op_connector.execute(sqlInsert)
        billing_profile = op_cursor.fetchone()
So far at least, I am receiving the following error:
SyntaxError: EOL while scanning string literal
This error message points at the closing bracket in
sqlInsert = "INSERT INTO billing_profile_dim VALUES ('"+str(key)+"','"+billing_profile.name"')
which I am currently unable to solve. I'm also aware that this code may run into another problem or two. Could someone please help me fix the current issue and make sure I'm heading down the correct path?
You are missing a double tick and a +
sqlInsert = "INSERT INTO billing_profile_dim VALUES ('"+str(key)+"','"+billing_profile.name+"')"
But you should really switch to prepared statements like
sql = "SELECT * FROM billing_profiles_dim WHERE id = '%s'"
dwh_cursor.execute(sql,(str(key),))
...
sqlInsert = ('INSERT INTO billing_profile_dim VALUES '
             '(%s, %s)')
dwh_cursor.execute(sqlInsert, (str(key), billing_profile.name))
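Note also that the original loop references billing_profile.name before that row has been fetched. A corrected sketch of the whole block (assuming billing_profile has a name column and op_connector accepts the same parameter style):

for key in billing_profiles_dim_id:
    dwh_cursor.execute("SELECT * FROM billing_profiles_dim WHERE id = %s", (str(key),))
    if dwh_cursor.fetchone() is None:
        # fetch the source row first, then insert it into the DWH
        op_cursor = op_connector.execute(
            "SELECT * FROM billing_profile WHERE id = %s", (str(key),))
        billing_profile = op_cursor.fetchone()
        dwh_cursor.execute("INSERT INTO billing_profile_dim VALUES (%s, %s)",
                           (str(key), billing_profile.name))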
I have a massive table (over 100B records) to which I added an empty column. I parse strings from another (string) field, extract an integer from it where the required string is available, and want to update the new column for all rows that contain that string.
At the moment, after data has been parsed and saved locally in a dataframe, I iterate on it to update the Redshift table with clean data. This takes approx 1sec/iteration, which is way too long.
My current code example:
conn = psycopg2.connect(connection_details)
cur = conn.cursor()
clean_df = raw_data.apply(clean_field_to_parse)
for ind, row in clean_df.iterrows():
    update_query = build_update_query(row.id, row.clean_int_1, row.clean_int_2)
    cur.execute(update_query)
where build_update_query is a function that generates the update query:

def build_update_query(id, int1, int2):
    query = """
        update tab_tab
        set
            clean_int_1 = {}::int,
            clean_int_2 = {}::int,
            updated_date = GETDATE()
        where id = {}
        ;
    """
    return query.format(int1, int2, id)
and where clean_df is structured like:

id   field_to_parse      clean_int_1   clean_int_2
1    {'int_1': '2+1'}    3             np.nan
2    {'int_2': '7-0'}    np.nan        7
Is there a way to update specific table fields in bulk, so that there is no need to execute one query at a time?
I'm parsing the strings and running the update statement from Python. The database is stored on Redshift.
As mentioned, consider pure SQL and avoid iterating through billions of rows, by pushing the Pandas data frame to the database as a staging table and then running one single UPDATE across both tables. With SQLAlchemy you can use DataFrame.to_sql to create a table replica of the data frame, and drop the very large staging table at the end. On plain Postgres you could even add an index on the join field, id; Redshift itself does not support CREATE INDEX, so skip that step there.
from sqlalchemy import create_engine
engine = create_engine("postgresql+psycopg2://myuser:mypwd@myhost/mydatabase")

# PUSH TO DATABASE (SAME NAME AS DF)
clean_df.to_sql(name="clean_df", con=engine, if_exists="replace", index=False)

# SQL UPDATE (USING TRANSACTION)
with engine.begin() as conn:
    # plain Postgres only -- Redshift does not support CREATE INDEX
    sql = "CREATE INDEX idx_clean_df_id ON clean_df(id)"
    conn.execute(sql)

    # note: SET columns must not be qualified with the table alias
    sql = """UPDATE tab_tab t
             SET clean_int_1 = c.clean_int_1,
                 clean_int_2 = c.clean_int_2,
                 updated_date = GETDATE()
             FROM clean_df c
             WHERE c.id = t.id
          """
    conn.execute(sql)

    sql = "DROP TABLE IF EXISTS clean_df"
    conn.execute(sql)

engine.dispose()
I am using pandas and a DataFrame to deal with some data. I want to load the data into a MySQL database where one of the fields is a Point.
In the file I am parsing with python I have the lat and lon of the points.
I have created a dataframe (df) with the point information (id and coords):
id coords
A GeomFromText( ' POINT(40.87 3.80) ' )
I have saved in coords the command required in mySQL to create a Point from the text. However, when executing:
from sqlalchemy import create_engine
engine = create_engine(dbconnection)
df.to_sql("point_test",engine, index=False, if_exists="append")
I got the following error:
DataError: (mysql.connector.errors.DataError) 1416 (22003): Cannot get geometry object from data you send to the GEOMETRY field

This is triggered because df.to_sql inserts GeomFromText( ' POINT(40.87 3.80) ' ) as the literal string "GeomFromText( ' POINT(40.87 3.80) ' )" rather than executing the MySQL function GeomFromText.
Does anyone have a suggestion about how to insert geometry fields into MySQL from a pandas DataFrame when the information is originally in text form?
A workaround is to create a temporary table holding the string form of the geometry that needs to be added, and then update the point_test table with a call to ST_GeomFromText from the temporary table.
Assuming database with table point_test with id (VARCHAR(5)) and coords(POINT):
a. Create an example DataFrame df with points "A" and "B":
import numpy as np
import pandas as pd

dfd = np.array([['id', 'geomText'],
                ["A", "POINT( 50.2 5.6 )"],
                ["B", "POINT( 50.2 50.4 )"]])
df = pd.DataFrame(data=dfd[1:, :], columns=dfd[0, :])
b.Add point "A" and "B" into point_test but only the id and add the string "geomText" into the table temp_point_test
df[['id']].to_sql("point_test",engine, index=False, if_exists="append")
df[['id', 'geomText']].to_sql("temp_point_test",engine, index=False, if_exists="append")
c. Update table point_test with the points from table temp_point_test, applying ST_GeomFromText() to the SELECT. Finally, drop temp_point_test:
conn = engine.connect()
conn.execute("update point_test pt set pt.coords = (select ST_GeomFromText(geomText) " +
             "from temp_point_test tpt where pt.id = tpt.id)")
conn.execute("drop table temp_point_test")
conn.close()
I'm working on a project that requires a column in postgresql to be updated by the Mapbox geocoding api to convert an address into lon,lat coordinates. I created a FOR loop to read in the address from each row. I'd like to then save the unique lon,lat coordinates created into the "coordinates" column.
However, the code I've written updates the entire "coordinates" column with the first row's lon,lat coordinates, rather than iterating and updating each row's "coordinates" column individually.
Where did I go wrong? Any help would be greatly appreciated.
Main Code
import psycopg2
import json
from psycopg2.extras import RealDictCursor
import sys
from mapbox import Geocoder
from mapboxgeocode import getCoord
import numpy as np
con = None
try:
    con = psycopg2.connect(database='database', user='username')
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS permits")
    cur.execute("""CREATE TABLE permits(issued_date DATE, address
        VARCHAR(200), workdesc VARCHAR(600), permit_type VARCHAR(100), permit_sub_type
        VARCHAR(100), anc VARCHAR(4), applicant VARCHAR(100), owner_name
        VARCHAR(200))""")
    cur.execute(""" COPY permits FROM '/path/to/csv/file'
        WITH DELIMITER ',' CSV HEADER """)
    cur.execute("""ALTER TABLE permits ADD COLUMN id SERIAL PRIMARY KEY;
        UPDATE permits SET id = DEFAULT;""")
    cur.execute("""ALTER TABLE permits ADD COLUMN coordinates VARCHAR(80);
        UPDATE permits SET coordinates = 4;""")
    cur.execute("""ALTER TABLE permits ADD COLUMN city VARCHAR(80);
        UPDATE permits SET city = 'Washington,DC'; ALTER TABLE permits ALTER
        COLUMN city SET NOT NULL;""")
    cur.execute("UPDATE permits SET address = address || ' ' || city;")
    cur.execute("SELECT * FROM permits;")
    for row in cur.fetchall():
        test = row[1]
        help = getCoord(test)
        cur.execute("UPDATE permits SET coordinates = %s;", (help,))
        print(test)
    con.commit()
except psycopg2.DatabaseError, e:
    print 'Error %s' % e
    sys.exit(1)
finally:
    if con:
        cur.close()
        con.commit()
        con.close()
Geocode Function
from mapbox import Geocoder
import numpy as np
def getCoord(address):
    geocoder = Geocoder(access_token='xxxxxxxxxxxxxxxx')
    response = geocoder.forward(address)
    first = response.geojson()['features'][0]
    row = first['geometry']['coordinates']
    return row
You need to add a WHERE condition to your UPDATE statement. Without a WHERE clause, SQL updates the coordinates column of every row. A proper WHERE condition lets it know exactly which row it needs to modify.
You'll probably want to use your primary key, as it's a unique identifier. Perhaps a statement along the lines of:
cur.execute("UPDATE permits SET coordinates = %s WHERE id = %s;", (help, row[index of the id column]) )
I think the row index you need would be row[8], but you'll have to confirm that in your code. I hope that gets it working.
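Alternatively, to avoid guessing the positional index at all, select only the columns you need, so their positions are fixed (a sketch reusing the question's getCoord):

cur.execute("SELECT id, address FROM permits;")
for row in cur.fetchall():
    coords = getCoord(row[1])           # row[1] is the address
    cur.execute("UPDATE permits SET coordinates = %s WHERE id = %s;",
                (str(coords), row[0]))  # row[0] is the SERIAL primary key
con.commit()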