How to copy specific columns from a CSV file to PostgreSQL - Python

import psycopg2
import time

def read_database():
    conn = None
    try:
        conn = psycopg2.connect(database="capitadb", user="capita_user", password="capita_user",
                                host="127.0.0.1", port="5432")
        cur = conn.cursor()
        start_time = time.time()
        cur.execute("COPY stagging(Activity_ID,F_Qtr,Fiscal_Week_Num,Manager,MBadge) FROM '/home/vivek/Downloads/dell_data.csv' DELIMITER ',' CSV;")
        print("--- %s seconds ---" % (time.time() - start_time))
        print("Operation done successfully")
        conn.commit()
    except Exception as e:
        print("Error: %s" % e)
    finally:
        conn.close()

if __name__ == '__main__':
    read_database()
Here we have 15 columns in the CSV file, but we want to copy only 4 of them. How can we achieve that without extracting the data into any intermediate file?

You will need to use the COPY FROM STDIN functionality - http://initd.org/psycopg/docs/cursor.html#cursor.copy_from. You can provide a file-like object to that function, and you can build one with the itertools module:
from itertools import chain, islice

class some_magic_adaptor(object):
    def __init__(self, src):
        # src is an iterable of strings; chain them into one stream of characters
        self.src = chain.from_iterable(src)
    def read(self, n):
        # return the next n characters, as copy_from expects from a file-like object
        return "".join(islice(self.src, None, n))

def read_csv():
    # transform_line should keep only the columns you want and return the line as text
    for line in open(csv_filename):
        yield transform_line(line)

file_like_object_for_postgresql = some_magic_adaptor(read_csv())
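To actually run the copy, the adaptor can be handed to cursor.copy_from. A minimal sketch, assuming the cur and conn from the question, the adaptor above, lowercase column names in the stagging table, and a hypothetical transform_line that keeps only the four wanted fields:

import csv

csv_filename = '/home/vivek/Downloads/dell_data.csv'

def transform_line(line):
    # hypothetical: keep only 4 of the 15 fields and re-join them with tabs,
    # the default column separator expected by copy_from
    # note: a header row in the CSV would need to be skipped before copying
    fields = next(csv.reader([line]))
    return "\t".join([fields[0], fields[1], fields[2], fields[3]]) + "\n"

cur.copy_from(file_like_object_for_postgresql, 'stagging',
              columns=('activity_id', 'f_qtr', 'fiscal_week_num', 'manager'))
conn.commit()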

Related

How to fix the DataFrame?

How can I fix the DataFrame visible in the picture below? I mean that "order_reference" and "location" should be the columns.
Here's my code:
import pyodbc
import pandas as pd
import json
import os
from queries import QUERIES

class Connection:
    def __init__(self):
        try:
            self.connection = pyodbc.connect("DSN=Databricks-Spark", autocommit=True)
        except Exception as e:
            print(e)
            raise Exception("Connection failed")

    def run_query(self, query_id):
        with self.connection as open_connection:
            cursor = open_connection.cursor()
            cursor = cursor.execute(query_id)
            columns = [column[0] for column in cursor.description]
            results = []
            for row in cursor.fetchall():
                results.append(str(dict(zip(columns, row))))
            return results

class DataManager:
    def __init__(self, QUERIES_DICT):
        self.connection = Connection()
        self.queries = QUERIES_DICT
        self.jsonfile = "mydata.json"
        self.data = {}
        self.dfs = {}
        try:
            if os.stat(self.jsonfile).st_size > 0:
                self.get_data_from_json()
            else:
                self.get_data_from_database()
                with open(self.jsonfile, "w") as f:
                    json.dump(self.data, f)
        except Exception as e:
            print(e)
            open(self.jsonfile, "w")
            self.get_data_from_database()
            with open(self.jsonfile, "w") as f:
                json.dump(self.data, f)

    def get_df(self):
        for query_id in self.queries:
            self.dfs[query_id] = pd.DataFrame(self.data[query_id])
        return self.dfs

    def get_data_from_json(self):
        with open(self.jsonfile, "r") as pre_load:
            self.data = json.load(pre_load)

    def get_data_from_database(self):
        for query_id in self.queries:
            try:
                self.data[query_id] = self.connection.run_query(self.queries[query_id])
            except Exception as e:
                print(f"Problem with query ID: {query_id}", e)

How to request a URL and download folders with Python?

I'm new to Python. I'm trying to request a URL (using an id stored in a PostgreSQL database) that sends me back zip archives with several files inside.
import psycopg2
import requests

url = "https://myurl/"
conn = psycopg2.connect(user="XXX", password="XXX", database="XXX", host="localhost", port="5432")
print("Successfully connected!")
cur = conn.cursor()
sql = "select id from public.base"
cur.execute(sql)
row = [item[0] for item in cur.fetchall()]
for d in row:
    requests.post(url+d)
The requests.post(url+d) call is working; I get a 200 response.
But I don't know how to do the next step, that is, how to save these zip archives into my workspace...
You could use the zipfile and io libraries for that, specifying your download location in an extractall call, like so :)
from psycopg2 import (
    connect,
    OperationalError,
)
from zipfile import (
    BadZipFile,
    ZipFile,
)
from io import BytesIO
import requests

def download_zip(url):
    response = requests.get(url)
    if response.ok:
        try:
            z = ZipFile(BytesIO(response.content))
            z.extractall("/path/to/destination_directory")
        except BadZipFile as ex:
            print('Error: {}'.format(ex))
        print('Download succeeded: {}'.format(url))
    else:
        print('Connection failed: {}'.format(url))

def main():
    conn = connect(
        user='XXX',
        password='XXX',
        database='XXX',
        host='localhost',
        port='5432',
    )
    try:
        cur = conn.cursor()
        cur.execute('select id from public.base')
    except OperationalError:
        exit(0)
    row = [item[0] for item in cur.fetchall()]
    for id in row:
        download_zip('https://myurl/{}'.format(id))
    print('Download completed')

if __name__ == '__main__':
    main()

Python calling one function from another

I have written a little Python script to get the files in a directory, compute a hash for each, and write them to a table.
The first part, getting the files and calculating the hash, was easy. But now I have added a function (write_record) to store the filename, log date and hash in a database, and I am struggling with how to call it from the get_files function so that a record is written for each file in the directory.
from datetime import datetime
from os import scandir
import os
import hashlib
import psycopg2

BLOCKSIZE = 65536
hasher = hashlib.sha256()
basepath = '.'

def convert_date(timestamp):
    d = datetime.utcfromtimestamp(timestamp)
    formated_date = d.strftime('%d%m%Y%H%M%S')
    return formated_date

def get_hash(entry):
    with open(entry, 'rb') as afile:
        buf = afile.read(BLOCKSIZE)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(BLOCKSIZE)
    # print(hasher.hexdigest())

def get_files():
    dir_entries = scandir('.')
    for entry in dir_entries:
        if entry.is_file():
            info = entry.stat()
            print(' %s %s %s' % (entry.name, convert_date(info.st_mtime), hasher.hexdigest()))
            log_filename = entry.name
            log_hashvalue = hasher.hexdigest()
            log_date = convert_date(info.st_mtime)
            return log_filename, log_hashvalue, log_date
            # write_record()

def write_record():
    log_filename, log_hashvalue, log_date = get_files()
    try:
        print(log_filename, log_hashvalue, log_date)
        connection = psycopg2.connect(user="postgres", password="xxxxxxxx", host="xxx.xxx.xxx.xxx", port="5432", database="evidence_logging")
        cursor = connection.cursor()
        postgres_insert_query = """ INSERT INTO logfiles (log_name,log_date,log_hashvalue) VALUES (%s,%s,%s)"""
        record_to_insert = (log_filename, log_date, log_hashvalue)
        print(postgres_insert_query, record_to_insert)
        cursor.execute(postgres_insert_query, record_to_insert)
        connection.commit()
        count = cursor.rowcount
        print(count, "Record inserted successfully into logfiles table")
    except (Exception, psycopg2.Error) as error:
        if connection:
            print("Failed to insert record into logfiles table", error)
    finally:
        # closing database connection.
        if connection:
            cursor.close()
            connection.close()
            print("PostgreSQL connection is closed")

write_record()
Thanks in advance
Regards
Georg
In your code you are calling the write_record() method, which will insert only one file, because get_files() returns as soon as it reaches the first file rather than iterating over all of them.
Instead, call get_files() first, and rather than returning from that method, have it call write_record() with the values it would have returned.
Also, do not close the connection after inserting every record; close it once, after all records have been inserted.
Try this:
from datetime import datetime
from os import scandir
import os
import hashlib
import psycopg2

BLOCKSIZE = 65536
hasher = hashlib.sha256()
basepath = '.'
connection = None

def convert_date(timestamp):
    d = datetime.utcfromtimestamp(timestamp)
    formated_date = d.strftime('%d%m%Y%H%M%S')
    return formated_date

def get_hash(entry):
    with open(entry, 'rb') as afile:
        buf = afile.read(BLOCKSIZE)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(BLOCKSIZE)
    # print(hasher.hexdigest())

def get_files():
    dir_entries = scandir('.')
    for entry in dir_entries:
        if entry.is_file():
            info = entry.stat()
            print(' %s %s %s' % (entry.name, convert_date(info.st_mtime), hasher.hexdigest()))
            log_filename = entry.name
            log_hashvalue = hasher.hexdigest()
            log_date = convert_date(info.st_mtime)
            write_record(log_filename, log_hashvalue, log_date)
    # close the connection after writing all records
    close_connection()

def write_record(log_filename, log_hashvalue, log_date):
    # use the module-level connection so close_connection() can see it
    global connection
    try:
        print(log_filename, log_hashvalue, log_date)
        connection = psycopg2.connect(user="postgres", password="xxxxxxxx", host="xxx.xxx.xxx.xxx", port="5432", database="evidence_logging")
        cursor = connection.cursor()
        postgres_insert_query = """ INSERT INTO logfiles (log_name,log_date,log_hashvalue) VALUES (%s,%s,%s)"""
        record_to_insert = (log_filename, log_date, log_hashvalue)
        print(postgres_insert_query, record_to_insert)
        cursor.execute(postgres_insert_query, record_to_insert)
        connection.commit()
        count = cursor.rowcount
        print(count, "Record inserted successfully into logfiles table")
    except (Exception, psycopg2.Error) as error:
        if connection:
            print("Failed to insert record into logfiles table", error)
    finally:
        cursor.close()

def close_connection():
    if connection:
        connection.close()
        print("PostgreSQL connection is closed")

get_files()
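Even with that restructuring, write_record above still opens a new connection for every file, and the global hasher is shared across files. A minimal sketch of the same idea with one connection opened up front and a fresh hasher per file (table name, credentials placeholders and timestamp format taken from the code above) could look like this:

import hashlib
from datetime import datetime
from os import scandir
import psycopg2

def write_record(cursor, log_filename, log_hashvalue, log_date):
    # parameterized insert into the same logfiles table as above
    cursor.execute(
        "INSERT INTO logfiles (log_name, log_date, log_hashvalue) VALUES (%s, %s, %s)",
        (log_filename, log_date, log_hashvalue),
    )

def main():
    connection = psycopg2.connect(user="postgres", password="xxxxxxxx",
                                  host="xxx.xxx.xxx.xxx", port="5432",
                                  database="evidence_logging")
    try:
        with connection.cursor() as cursor:
            for entry in scandir('.'):
                if entry.is_file():
                    # hash each file with its own hasher
                    hasher = hashlib.sha256()
                    with open(entry.path, 'rb') as afile:
                        for chunk in iter(lambda: afile.read(65536), b''):
                            hasher.update(chunk)
                    log_date = datetime.utcfromtimestamp(entry.stat().st_mtime).strftime('%d%m%Y%H%M%S')
                    write_record(cursor, entry.name, hasher.hexdigest(), log_date)
        connection.commit()
    finally:
        connection.close()

if __name__ == '__main__':
    main()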

How to read a CSV file from S3 and write the content to an RDS database table using a Python Lambda function?

I have a CSV file, Employee.csv, in an S3 bucket with information about employees: name, age, salary, designation.
I have to write a Python Lambda function that reads this file and writes it into an RDS database, creating a table such as Employee with the columns name, age, salary, designation, whose rows hold the data.
Employee.csv is just an example; in practice it can be any CSV file with any number of columns.
from __future__ import print_function
import boto3
import logging
import os
import sys
import uuid
import pymysql
import csv
import rds_config

rds_host = rds_config.rds_host
name = rds_config.db_username
password = rds_config.db_password
db_name = rds_config.db_name

logger = logging.getLogger()
logger.setLevel(logging.INFO)

try:
    conn = pymysql.connect(rds_host, user=name, passwd=password, db=db_name, connect_timeout=5)
except Exception as e:
    logger.error("ERROR: Unexpected error: Could not connect to MySql instance.")
    logger.error(e)
    sys.exit()

logger.info("SUCCESS: Connection to RDS mysql instance succeeded")
s3_client = boto3.client('s3')

def handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']
    download_path = '/tmp/{}{}'.format(uuid.uuid4(), key)
    s3_client.download_file(bucket, key, download_path)
    csv_data = csv.reader(open(download_path))
    with conn.cursor() as cur:
        for idx, row in enumerate(csv_data):
            logger.info(row)
            try:
                cur.execute('INSERT INTO target_table(name, age, salary, designation)'
                            'VALUES("%s", "%s", "%s", "%s")',
                            row)
            except Exception as e:
                logger.error(e)
            if idx % 100 == 0:
                conn.commit()
        conn.commit()
    return 'File loaded into RDS:' + str(download_path)
Here is the code which is working for me now:
s3 = boto3.resource('s3')
file_object = event['Records'][0]
key = str(file_object['s3']['object']['key'])
obj = s3.Object(bucket, key)
content_lines = obj.get()['Body'].read().decode('utf-8').splitlines(True)
tableName = key.strip('folder/').strip('.csv')
with conn.cursor() as cur:
    try:
        cur.execute('TRUNCATE TABLE ' + tableName)
    except Exception as e:
        print("ERROR: Unexpected error: Table does not exist.")
        sys.exit()
    header = True
    for row in csv.reader(content_lines):
        if header:
            numberOfColumns = len(row)
            columnNames = str(row).replace('[', '').replace(']', '').replace("'", '')
            print("columnNames:" + columnNames)
            values = '%s'
            numberOfValues = 1
            while numberOfValues < numberOfColumns:
                values = values + ",%s"
                numberOfValues += 1
            print("INSERT into " + tableName + "(" + columnNames + ") VALUES(" + values + ")")
            header = False
        else:
            try:
                cur.execute('INSERT into ' + tableName + '(' + columnNames + ') VALUES(' + values + ')', row)
            except Exception as e:
                raise e
    conn.commit()
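One caution about the snippet above: str.strip('folder/') removes individual characters, not a prefix, and the table and column names are interpolated straight from the object key and header row. A minimal sketch of a more defensive way to derive the table name and build the INSERT from the header (assuming the same pymysql conn, csv module and content_lines as above) could be:

import os
import csv

def load_csv_rows(conn, key, content_lines):
    # derive the table name from the object key, e.g. 'folder/Employee.csv' -> 'Employee'
    table_name = os.path.splitext(os.path.basename(key))[0]
    reader = csv.reader(content_lines)
    column_names = next(reader)                      # header row
    placeholders = ", ".join(["%s"] * len(column_names))
    columns = ", ".join("`{}`".format(c.strip()) for c in column_names)
    insert_sql = "INSERT INTO `{}` ({}) VALUES ({})".format(table_name, columns, placeholders)
    with conn.cursor() as cur:
        for row in reader:
            cur.execute(insert_sql, row)             # values stay parameterized
    conn.commit()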

Return SQL result from function as array

Since the query returns more than one result, how can the Get_results class return data_out as an array (list) so that I can iterate over the results of the query?
import psycopg2
import sys

class Get_results():
    def db_call(self, query, dbHost, dbName, dbUser, dbPass):
        try:
            con = None
            con = psycopg2.connect(host=dbHost, database=dbName,
                                   user=dbUser, password=dbPass)
            cur = con.cursor()
            cur.execute(query)
            data = cur.fetchall()
            for data_out in data:
                return data_out
        except psycopg2.DatabaseError, e:
            print 'Error %s' % e
            sys.exit(1)
        finally:
            if con:
                con.close()

sql = " some sql "
w = Get_results()
for i in w.db_call(sql, dbHost, dbName, dbUser, dbPass):
    print "The result is : " + i
For additional info, if I add print data right after data = cur.fetchall(), I get the following result:
[('The_Galaxy', 'The_Galaxy:star'),
('The_Galaxy', 'The_Galaxy:planet')]
The immediate answer is to change:
for data_out in data:
    return data_out
to:
for data_out in data:
    yield data_out
But you should look at using a with statement (if the DB API supports it), and simplifying the code - this could just be done by making a generator function (a class is OTT for this)
import psycopg2
import sys

class Get_results():
    def db_call(self, query, dbHost, dbName, dbUser, dbPass):
        try:
            con = None
            con = psycopg2.connect(host=dbHost, database=dbName,
                                   user=dbUser, password=dbPass)
            cur = con.cursor()
            cur.execute(query)
            data = cur.fetchall()
            resultList = []
            for data_out in data:
                resultList.append(data_out[1])
            return resultList
        except psycopg2.DatabaseError, e:
            print 'Error %s' % e
            sys.exit(1)
        finally:
            if con:
                con.close()

sql = " some sql "
w = Get_results()
for i in w.db_call(sql, dbHost, dbName, dbUser, dbPass):
    print "The result is : " + i
