The following function is supposed to create a table in PostgreSQL, but if I execute it with the same table name in both the SQL script and df.to_sql('spotify'...), it freezes and does nothing; when I use a different name for either of the two statements, it creates two different tables in my database. Something tells me I should not be creating two connections (conn and engine), but I'm not sure about this.
I want to be able to create my own table and then append my df to it.
I'd highly appreciate some feedback.
def extraction():
    TOKEN = 'BQC1Cas7Nj6T61Gkq7ufKa2e6MKjNXjembypav0wsMuEVATyZSZRbgELPXR1i12Qzz8doLck1cueDIn-uqp0EcvyYeVHnFIEGb4MkCjgmIl8975UIDkCvP9WTBzUDHok1RmuQw6ySeHMkREuY-KtWm367yopkyBWQYuR28It'
    # We need headers to send along with our request, so this should be part of our request.
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": "Bearer {token}".format(token=TOKEN)
    }
    r = requests.get("https://api.spotify.com/v1/me/player/recently-played", headers=headers)
    response = r.json()
    if 'error' in response:
        print('The TOKEN is either wrong or has expired')
    else:
        # If the response went smoothly, we proceed to loop through the JSON dictionary and extract the values from it.
        my_song_list = []
        global df
        for song in response['items']:
            artist_id = song['track']['artists'][0]['id']
            artist_name = song['track']['artists'][0]['name']
            artist_link = song['track']['artists'][0]['external_urls']['spotify']
            album_id = song['track']['album']['id']
            album_name = song['track']['album']['name']
            album_link = song['track']['album']['external_urls']['spotify']
            song_id = song['track']['id']
            song_name = song['track']['name']
            song_link = song['track']['external_urls']['spotify']
            duration_ms = song['track']['duration_ms']
            popularity = song['track']['popularity']
            disc_number = song['track']['disc_number']
            played_at = song['played_at'].split(".")[0]
            song_dic = {'artist_id': artist_id,
                        'artist_name': artist_name,
                        'artist_link': artist_link,
                        'album_id': album_id,
                        'album_name': album_name,
                        'album_link': album_link,
                        'song_id': song_id,
                        'song_name': song_name,
                        'song_link': song_link,
                        'duration_ms': duration_ms,
                        'popularity': popularity,
                        'disc_number': disc_number,
                        'played_at': played_at
                        }
            my_song_list.append(song_dic)  # In order to convert the dictionaries to a DataFrame, append them to a list first.
        df = pd.DataFrame(my_song_list)  # Now that all the songs are in a list, it can be converted to a DataFrame.
        # Basic transformations performed on the DataFrame:
        # Re-ordering columns
        df = df[["artist_id", "artist_link", "album_id", "album_name", "album_link", "song_id", "song_name", "song_link", "duration_ms", "popularity", "disc_number", "played_at"]]
        # Creating two columns (date, time) by splitting the played_at column.
        df[['date', 'time']] = df['played_at'].str.split('T', expand=True)
        # Right now played_at, date & time are objects, so we need to change these to timestamps.
        df['date'] = pd.to_datetime(df['date'])
        df['time'] = pd.to_datetime(df['time'])
        df['played_at'] = pd.to_datetime(df['played_at'])
        df['played_at'] = df['played_at'].dt.tz_localize('US/Central')
        return df
def loading():
    # psycopg2 is only used when connecting to a PostgreSQL database, so we first make contact by setting some basic info.
    conn = psycopg2.connect(host='127.0.0.1', port='5432', dbname='Athenas', user='postgres', password='cis15a')
    # Creating a cursor to display the PostgreSQL version
    cur = conn.cursor()
    print('=============================================================')
    print('Connected to Athenas')
    print('PostgreSQL database version:')
    print("=============================================================")
    cur.execute('SELECT version()')
    db_version = cur.fetchone()
    print(db_version)
    # Creating the "spotify" table in PostgreSQL using the psycopg2 library.
    table_py = """
        CREATE TABLE IF NOT EXISTS spotify(
            unique_identifier SERIAL PRIMARY KEY,
            artist_link VARCHAR(255) NOT NULL,
            album_id VARCHAR(255) NOT NULL,
            album_name VARCHAR(255) NULL,
            album_link VARCHAR(255) NOT NULL,
            song_id VARCHAR(255) NOT NULL,
            song_name VARCHAR(255) NOT NULL,
            song_link VARCHAR(255) NOT NULL,
            duration INT NOT NULL,
            popularity INT NULL,
            disc_number INT NULL,
            played_at TIMESTAMP NOT NULL,
            date DATE NOT NULL,
            time TIME NOT NULL
        )
        """
    # Executing the CREATE TABLE statement using the cur cursor.
    try:
        cur.execute(table_py)
        print("=============================================================")
        print("=============================================================")
        print("=============================================================")
        print("All good")
        print("=============================================================")
        print("=============================================================")
        print("=============================================================")
    except Exception as e:
        print("An error occurred when initializing the database")
        print("=============================================================")
        print("=============================================================")
        print("=============================================================")
    # In order to load the existing DataFrame into the table we previously created with the psycopg2 library,
    # we now need to create an engine with SQLAlchemy and APPEND the DataFrame to the spotify table.
    engine = sa.create_engine('postgresql://postgres:cis15a@localhost:5432/Athenas')
    df.to_sql('spotify', con=engine, index=False, if_exists='append')
    print("=============================================================")
    print("=============================================================")
    print('The ETL ran successfully')
    print("=============================================================")
    print("=============================================================")
    cur.close()
    conn.commit()
You execute the CREATE TABLE statement twice:

cur.execute(table_py)
if cur.execute(table_py) == True:

What you wanted to do was:

try:
    cur.execute(table_py)
    print("All good")
except Exception as e:
    print("An error occurred when initializing the database")
Related
I have created an SQLite database. Even though I have included the relationship between the primary and foreign keys, I am not able to see the connections between them when I generate the ER diagram. I am using DataGrip to create the diagram. I tested other databases in DataGrip and DbVisualizer and I do not have any problems with them, only with this one.
ER diagram -
This is the script I used for creating the two tables in the database -
def create_titles_table():
    # connect to the database
    conn = sqlite3.connect("imdb.db")
    # create a cursor
    c = conn.cursor()
    print()
    print("Creating titles table...")
    c.execute(
        """CREATE TABLE IF NOT EXISTS titles
        (titleId TEXT NOT NULL, titleType TEXT,
        primaryTitle TEXT, originalTitle TEXT,
        isAdult INTEGER, startYear REAL,
        endYear REAL, runtimeMinutes REAL,
        PRIMARY KEY (titleId)
        )
        """
    )
    # commit changes
    conn.commit()
    # read the title data
    df = load_data("title.basics.tsv")
    # replace \N with nan
    df.replace("\\N", np.nan, inplace=True)
    # rename columns
    df.rename(columns={"tconst": "titleId"}, inplace=True)
    # drop the genres column
    title_df = df.drop("genres", axis=1)
    # convert the data types from str to numeric
    title_df["startYear"] = pd.to_numeric(title_df["startYear"], errors="coerce")
    title_df["endYear"] = pd.to_numeric(title_df["endYear"], errors="coerce")
    title_df["runtimeMinutes"] = pd.to_numeric(
        title_df["runtimeMinutes"], errors="coerce"
    )
    # insert the data into titles table
    title_df.to_sql("titles", conn, if_exists="replace", index=False)
    # commit changes
    conn.commit()
    # close the connection
    conn.close()
    print("Completed!")
    print()
def create_ratings_table():
    # connect to the database
    conn = sqlite3.connect("imdb.db")
    # create a cursor
    c = conn.cursor()
    print()
    print("Creating ratings table...")
    c.execute(
        """CREATE TABLE IF NOT EXISTS ratings
        (titleId TEXT NOT NULL, averageRating REAL, numVotes INTEGER,
        FOREIGN KEY (titleId) REFERENCES titles(titleId)
        )
        """
    )
    # commit changes
    conn.commit()
    # read the data
    df = load_data("title.ratings.tsv")
    df.rename(columns={"tconst": "titleId"}, inplace=True)
    # insert the data into the ratings table
    df.to_sql("ratings", conn, if_exists="replace", index=False)
    # commit changes
    conn.commit()
    # close the connection
    conn.close()
    print("Completed!")
    print()
Can anyone tell me where I am making the mistake?
I am trying to create a few tables in Postgres from a pandas DataFrame, but I keep getting this error:
psycopg2.errors.InvalidForeignKey: there is no unique constraint matching given keys for referenced table "titles"
After looking into this problem for hours, I finally found that when I insert the data into the parent table from the pandas DataFrame, the primary key constraint gets removed for some reason, and because of that I get this error when trying to reference it from another table.
But I do not have this problem when I use pgAdmin 4 to create the table and insert a few rows of data manually.
You can see that when I created the tables using pgAdmin, the primary key and foreign keys were created as expected and I have no problem with them.
But when I try to insert the data from the pandas DataFrame using the psycopg2 library, the primary key is not created.
I can't understand why this is happening.
The code I am using to create the tables -
# function for faster data insertion
def psql_insert_copy(table, conn, keys, data_iter):
    """
    Execute SQL statement inserting data

    Parameters
    ----------
    table : pandas.io.sql.SQLTable
    conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
    keys : list of str
        Column names
    data_iter : Iterable that iterates the values to be inserted
    """
    # gets a DBAPI connection that can provide a cursor
    dbapi_conn = conn.connection
    with dbapi_conn.cursor() as cur:
        s_buf = StringIO()
        writer = csv.writer(s_buf)
        writer.writerows(data_iter)
        s_buf.seek(0)

        columns = ", ".join('"{}"'.format(k) for k in keys)
        if table.schema:
            table_name = "{}.{}".format(table.schema, table.name)
        else:
            table_name = table.name

        sql = "COPY {} ({}) FROM STDIN WITH CSV".format(table_name, columns)
        cur.copy_expert(sql=sql, file=s_buf)
def create_titles_table():
    # connect to the database
    conn = psycopg2.connect(
        dbname="imdb",
        user="postgres",
        password=os.environ.get("DB_PASSWORD"),
        host="localhost",
    )
    # create a cursor
    c = conn.cursor()
    print()
    print("Creating titles table...")
    c.execute(
        """CREATE TABLE IF NOT EXISTS titles(
            title_id TEXT PRIMARY KEY,
            title_type TEXT,
            primary_title TEXT,
            original_title TEXT,
            is_adult INT,
            start_year REAL,
            end_year REAL,
            runtime_minutes REAL
        )
        """
    )
    # commit changes
    conn.commit()
    # read the title data
    df = load_data("title.basics.tsv")
    # replace \N with nan
    df.replace("\\N", np.nan, inplace=True)
    # rename columns
    df.rename(
        columns={
            "tconst": "title_id",
            "titleType": "title_type",
            "primaryTitle": "primary_title",
            "originalTitle": "original_title",
            "isAdult": "is_adult",
            "startYear": "start_year",
            "endYear": "end_year",
            "runtimeMinutes": "runtime_minutes",
        },
        inplace=True,
    )
    # drop the genres column
    title_df = df.drop("genres", axis=1)
    # convert the data types from str to numeric
    title_df["start_year"] = pd.to_numeric(title_df["start_year"], errors="coerce")
    title_df["end_year"] = pd.to_numeric(title_df["end_year"], errors="coerce")
    title_df["runtime_minutes"] = pd.to_numeric(
        title_df["runtime_minutes"], errors="coerce"
    )
    # create SQLAlchemy engine
    engine = create_engine(
        "postgresql://postgres:" + os.environ["DB_PASSWORD"] + "@localhost:5432/imdb"
    )
    # insert the data into titles table
    title_df.to_sql(
        "titles", engine, if_exists="replace", index=False, method=psql_insert_copy
    )
    # commit changes
    conn.commit()
    # close cursor
    c.close()
    # close the connection
    conn.close()
    print("Completed!")
    print()
def create_genres_table():
    # connect to the database
    conn = psycopg2.connect(
        dbname="imdb",
        user="postgres",
        password=os.environ.get("DB_PASSWORD"),
        host="localhost",
    )
    # create a cursor
    c = conn.cursor()
    print()
    print("Creating genres table...")
    c.execute(
        """CREATE TABLE IF NOT EXISTS genres(
            title_id TEXT NOT NULL,
            genre TEXT,
            FOREIGN KEY (title_id) REFERENCES titles(title_id)
        )
        """
    )
    # commit changes
    conn.commit()
    # read the data
    df = load_data("title.basics.tsv")
    # replace \N with nan
    df.replace("\\N", np.nan, inplace=True)
    # rename columns
    df.rename(columns={"tconst": "title_id", "genres": "genre"}, inplace=True)
    # select only relevant columns
    genres_df = df[["title_id", "genre"]].copy()
    genres_df = genres_df.assign(genre=genres_df["genre"].str.split(",")).explode(
        "genre"
    )
    # create engine
    engine = create_engine(
        "postgresql://postgres:" + os.environ["DB_PASSWORD"] + "@localhost:5432/imdb"
    )
    # insert the data into genres table
    genres_df.to_sql(
        "genres", engine, if_exists="replace", index=False, method=psql_insert_copy
    )
    # commit changes
    conn.commit()
    # close cursor
    c.close()
    # close the connection
    conn.close()
    print("Completed!")
    print()
if __name__ == "__main__":
    print()
    print("Creating IMDB Database...")
    # connect to the database
    conn = psycopg2.connect(
        dbname="imdb",
        user="postgres",
        password=os.environ.get("DB_PASSWORD"),
        host="localhost",
    )
    # create the titles table
    create_titles_table()
    # create genres table
    create_genres_table()
    # close the connection
    conn.close()
    print("Done with Everything!")
    print()
I think the problem is to_sql(if_exists="replace"). Try using to_sql(if_exists="append") - my understanding is that "replace" drops the whole table and creates a new one with no constraints.
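To keep the constraints, the table has to exist with its PRIMARY KEY before the load, and to_sql must not be allowed to recreate it. A rough sketch of that pattern (assuming the poster's titles definition and made-up connection credentials), using if_exists="append":

from sqlalchemy import create_engine, text

engine = create_engine("postgresql://postgres:password@localhost:5432/imdb")  # assumed credentials

# create the table with its constraints once, outside of to_sql
with engine.begin() as conn:
    conn.execute(text("""
        CREATE TABLE IF NOT EXISTS titles(
            title_id TEXT PRIMARY KEY,
            title_type TEXT,
            primary_title TEXT,
            original_title TEXT,
            is_adult INT,
            start_year REAL,
            end_year REAL,
            runtime_minutes REAL
        )
    """))

# title_df is the DataFrame prepared in the question.
# "append" inserts into the existing table instead of dropping and recreating it,
# so the PRIMARY KEY (and any FOREIGN KEY pointing at it) survives the load.
title_df.to_sql("titles", engine, if_exists="append", index=False)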
I'm new to Python and I want to update every record that has a count of 0 in the database. I have tried a lot but can't find anything that helps.
for row in cur.fetchall():
    if row[3] == 0:
        cur.execute("UPDATE tble SET count = 1 WHERE name = %s" %row[1])
Assuming your table has this structure:
CREATE TABLE `test` (
    `sno` int(11) NOT NULL,
    `name` varchar(50) NOT NULL,
    `count` int(11) NOT NULL,
    `dtCreated` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP
);
Here is the simple code -
import pymysql

conn = pymysql.connect(host='localhost', unix_socket='', user='USER', passwd='PASSWORD', db='DATABASENAME')
cur = conn.cursor()
cur.execute("SELECT * FROM test")
for r in cur:
    curr = conn.cursor()
    # pass the name as a parameter so the driver quotes it safely
    sql = "UPDATE test SET count = 1 WHERE name = %s"
    # print(sql)
    try:
        # Execute the SQL command
        curr.execute(sql, (r[1],))
        # Commit your changes in the database
        conn.commit()
    except:
        # Rollback in case there is any error
        conn.rollback()
    curr.close()
cur.close()
conn.close()
Also, since you mentioned that you are new to Python: remember to commit every time you run INSERT, UPDATE or DELETE queries.
Hope it helps.
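If the goal is just to flip every count of 0 to 1, the row-by-row loop can also be replaced by a single statement plus one commit; a short sketch against the same assumed test table:

import pymysql

conn = pymysql.connect(host='localhost', user='USER', passwd='PASSWORD', db='DATABASENAME')
try:
    with conn.cursor() as cur:
        # let MySQL pick the matching rows instead of filtering them in Python
        cur.execute("UPDATE `test` SET `count` = 1 WHERE `count` = 0")
    conn.commit()  # required for the UPDATE to be persisted
finally:
    conn.close()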
I'm new to Python and trying to save raw POST data into MySQL.
I want to iterate over each element in the JSON that is posted and save all the data to the DB.
JSON list of objects (30 objects, each with 11 columns):
[
    {
        "col1": 7878,
        "col2": "c004979d3969a86a8fdcda2f92eb39e3",
        "col3": "b000yht23",
        ...
        "col11": 2
    },
    {
        "col1": 7878,
        "col2": "c004979d3969a86a8fdcda2f92eb39e3",
        "col3": "b000yht23",
        ...
        "col11": 43
    },
    # up to 30 objects
    ....
]
'json_test' table desc:
CREATE TABLE json_test (
    `col1` varchar(250) NOT NULL,
    `col2` varchar(250) NOT NULL,
    `col3` varchar(250) NOT NULL,
    `col4` varchar(250) NOT NULL,
    `col5` varchar(250) NOT NULL,
    `col6` varchar(250) NOT NULL,
    `col7` varchar(250) NOT NULL,
    `col8` varchar(250) NOT NULL,
    `col9` varchar(250) NOT NULL,
    `col10` varchar(250) NOT NULL,
    `col11` varchar(200) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
UPDATED to save data to DB:
My py code looks like:
from flask import Flask, abort, request
import json
import pymysql

app = Flask(__name__)

@app.route('/foo', methods=['GET', 'POST'])
def foo():
    jsonobject = request.json
    if not jsonobject:
        abort(400)
    # load - converts JSON source text to a Python value
    #readable_json=json.dumps(jsonobject)
    #UPDATED with column_names
    k = 0
    for i in jsonobject:
        # Connect to the database
        conn = pymysql.connect(host='10.20.3.4', port=3306, user='root', passwd='', db='python_db')
        try:
            with conn.cursor() as cursor:
                column_names = ['col1','col2','col3',...'col11']
                column_names_str = ', '.join(column_names)
                binds_str = ', '.join('%s' for _ in range(len(column_names)))
                sql = ("INSERT INTO `json_test` ({column_names})"
                       " VALUES({binds})"
                       .format(column_names=column_names_str, binds=binds_str))
                for data_dict in jsonobject:
                    values = [data_dict[column_name]
                              for column_name in column_names]
                    cursor.execute(sql, values)
                    print("Insert successfull!")
            #UPDATED
            k += 1
            conn.commit()
        finally:
            conn.close()
    return "Insert successful"
    #return json.dumps(jsonobject)

if __name__ == '__main__':
    app.run(host='10.22.1.168', debug=True, port=7845)
UPDATED code result:
Only the last record seems to be inserted.
Replace this mess
#UPDATED with column_names
k = 0
for i in jsonobject:
    # Connect to the database
    conn = pymysql.connect(host='10.20.3.4', port=3306, user='root', passwd='', db='python_db')
    try:
        with conn.cursor() as cursor:
            column_names = ['col1','col2','col3',...'col11']
            column_names_str = ', '.join(column_names)
            binds_str = ', '.join('%s' for _ in range(len(column_names)))
            sql = ("INSERT INTO `json_test` ({column_names})"
                   " VALUES({binds})"
                   .format(column_names=column_names_str, binds=binds_str))
            for data_dict in jsonobject:
                values = [data_dict[column_name]
                          for column_name in column_names]
                cursor.execute(sql, values)
                print("Insert successfull!")
        #UPDATED
        k += 1
        conn.commit()
    finally:
        conn.close()
return "Insert successful"
with
try:
    with conn.cursor() as cursor:
        columns_names = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6',
                         'col7', 'col8', 'col9', 'col10', 'col11']
        columns_names_str = ', '.join(columns_names)
        binds_str = ', '.join('%s' for _ in range(len(columns_names)))
        for data_dict in jsonobject:
            sql = ("INSERT INTO json_test ({columns_names}) "
                   "VALUES ({binds})"
                   .format(columns_names=columns_names_str,
                           binds=binds_str))
            values = [data_dict[column_name]
                      for column_name in columns_names]
            cursor.execute(sql, values)
            print("Insert successfull!")
        conn.commit()
finally:
    conn.close()
Summation
- The k object is redundant.
- The name i is unclear and makes it look like some kind of index when it is not: it is a dict object.
- We don't need to create a connection for each object from jsonobject, because that is an expensive operation.
- We don't need to rebuild the sql object on each iteration either (it remains unchanged).
- Storing the column names in a list/tuple saves us from writing them twice: once in the query and once in the values extraction.
- Creating the binds string (%s, %s, ...) dynamically, based on the number of columns, saves us from typos where we've missed or added too many bind placeholders.

json.dumps does the opposite of what you claim; it converts a Python object into a string.
The result of request.json is already a Python data structure. You don't need to do anything else with it.
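On the insertion itself, pymysql can also send all rows in one call with executemany, which keeps a single connection, a single SQL string, and a single commit; a sketch under the same assumptions about the posted payload (jsonobject being the list returned by request.json):

import pymysql

column_names = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6',
                'col7', 'col8', 'col9', 'col10', 'col11']
sql = "INSERT INTO `json_test` ({}) VALUES ({})".format(
    ', '.join(column_names),
    ', '.join(['%s'] * len(column_names)))

conn = pymysql.connect(host='10.20.3.4', port=3306, user='root', passwd='', db='python_db')
try:
    with conn.cursor() as cursor:
        # one tuple per JSON object, values taken in column order
        rows = [tuple(obj[name] for name in column_names) for obj in jsonobject]
        cursor.executemany(sql, rows)
    conn.commit()
finally:
    conn.close()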
I have a Python script that I created to update a MySQL database. The insert works perfectly, but when I try to update, nothing happens and nothing changes.
The console displays this error from the try/except:
Unable to print data
Can anyone help me fix this error?
MySQL database

Database: student
Table structure for table stu:

Column   Type          Null  Default
ID       int(8)        No
Name     varchar(255)  No
subject  varchar(255)  No

Dumping data for table stu:

11  jhon   python
12  jina   hjsdhjsd
13  jaSDJ  JHAISDJ
Python script
#!/usr/bin/python
# UPDATE AND delete some values from the database ###
import MySQLdb

db = MySQLdb.Connect("localhost", "****", "******", "student")
cursor = db.cursor()
sql = "UPDATE STU SET NAME = MAROUN, SUBJECT = C++ WHERE ID = 13 "
try:
    cursor.execute(sql)
    # r = cursor.fetchall()
    # for row in r:
    #     ID = row[0]
    #     NAME = row[1]
    #     SUBJECT = row[2]
    #     print "ID = %d, LAST_NAME = %s, SUBJECT = %s " % (ID, NAME, SUBJECT)
    print "update ok "
except Exception as e:
    print e
db.close()
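Two things stand out in that script (both ordinary MySQLdb behaviour rather than anything specific to this code): MAROUN and C++ are sent to MySQL as unquoted tokens, so the statement itself fails, and even a successful UPDATE would not persist without a commit. A hedged sketch of a parameterized version, assuming the same stu table:

#!/usr/bin/python
import MySQLdb

db = MySQLdb.Connect("localhost", "user", "password", "student")
cursor = db.cursor()

# let the driver quote the string values and supply the id as a parameter
sql = "UPDATE STU SET NAME = %s, SUBJECT = %s WHERE ID = %s"
try:
    cursor.execute(sql, ("MAROUN", "C++", 13))
    db.commit()   # the UPDATE is only persisted once the transaction is committed
    print("update ok")
except Exception as e:
    db.rollback()
    print(e)
db.close()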