I have to do the files comparison for the huge(10-20 millions) set of records.
Requirement explanation:
For the files comparison, there will be two files to do the comparison
and find the different records.
The files type are : .txt , .csv , .xlsx , .mdb or .accdb
The File 1 can be any type as mentioned in the first point.
The File 2 can be any type as mentioned in the first point.
The delimiter for File 1 or File 2 are unknown, it may be any from ~^;|.
Each file is having more than 70 columns in each.
File 1 is older than File 2 in terms of records. File 1 may have 10 million and File 2 may have 10.2 millions of records.
Need to create File 3, which consists of different records(for example 0.2 million of records from point 6) from File 1 to File 2 with the column header.
My Try: I have used SET for collecting data from both the files(File1 and File2) and done the comparison
using for and if condition.
import pyodbc
import os.path
import string
import re
import sys
import time
from datetime import datetime
# Function for Do you want to continue
def fun_continue():
# If you want to continue
yesno = raw_input('\nDo you want to continue(Y/N)?')
if yesno == 'Y':
fun_comparison()
else:
sys.exit()
def fun_comparison():
# Getting Input Value's
file1 = raw_input('Enter the file1 name with path:')
file_extension_old = os.path.splitext(file1)[1]
#Condition check for the File extension, if it's ACCESS DB then ask for the table name
if (file_extension_old == ".accdb") or (file_extension_old == ".mdb"):
table_name_old = raw_input('Enter table name:')
file2 = raw_input('Enter the latest file name:')
file_extension_latest = os.path.splitext(file2)[1]
#Condition check for the File extension, if it's ACCESS DB then ask for the table name
if (file_extension_latest == ".accdb") or (file_extension_latest == ".mdb"):
table_name_latest = raw_input('Enter table name:')
file3 = raw_input('Give the file name to store the comparison result:')
print('Files comparison is running! Please wait...')
# Duration Calculation START TIME
start_time = datetime.now()
# Code for file Comparison
try:
#Condition check for the ACCESS FILE -- FILE 1
if (file_extension_old == ".accdb") or (file_extension_old == ".mdb"):
conn_string_old = r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+file1+';'
con_old = pyodbc.connect(conn_string_old)
cur_old = con_old.cursor()
#Getting Column List
res_old = cur_old.execute('SELECT * FROM '+table_name_old+' WHERE 1=0')
column_list = [tuple(map(str, record_new))[0] for record_new in res_old.description]
column_list = ';'.join(column_list)
#For Getting Data
SQLQuery_old = 'SELECT * FROM '+table_name_old+';'
rows_old = cur_old.execute(SQLQuery_old).fetchall()
records_old = [tuple(map(str,record_old)) for record_old in rows_old]
records_old = [";".join(t) + "\n" for t in records_old]
records_old = set(records_old)
records_old = map(str.strip, records_old)
#print records_old
else:
with open(file1) as a:
column_list = a.readline()
column_list = re.sub(r"[;,|^~]", ";", column_list)
a = set(a)
sete = map(str.strip, a)
setf = [re.sub(r"[;,|^~]", ";", s) for s in sete]
records_old = [";".join(map(str.strip, i.split(";"))) for i in setf]
#Condition check for the ACCESS FILE -- FILE 2
if (file_extension_latest == ".accdb") or (file_extension_latest == ".mdb"):
conn_string_new = r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+file2+';'
con_new = pyodbc.connect(conn_string_new)
cur_new = con_new.cursor()
#Getting Column List
res_new = cur_new.execute('SELECT * FROM '+table_name_latest+' WHERE 1=0')
column_list = [tuple(map(str, record_new))[0] for record_new in res_new.description]
column_list = ';'.join(column_list)
SQLQuery_new = 'SELECT * FROM '+table_name_latest+';'
rows_new = cur_new.execute(SQLQuery_new).fetchall()
records_new = [tuple(map(str,record_new)) for record_new in rows_new]
records_new = [";".join(t) + "\n" for t in records_new]
records_new = set(records_new)
records_new = map(str.strip, records_new)
#print records_new
else:
with open(file2) as b:
column_list = b.readline()
column_list = re.sub(r"[;,|^~]", ";", column_list)
b = set(b)
sete = map(str.strip, b)
setf = [re.sub(r"[;,|^~]", ";", s) for s in sete]
records_new = [";".join(map(str.strip, i.split(";"))) for i in setf]
column_list = column_list.strip()
column_list = column_list.replace('; ', ';').strip(' ')
with open(file3, 'w') as result:
result.write(column_list + '\n')
for line in records_new:
if line not in records_old:
result.write(line + '\n')
except Exception as e:
print('\n\nError! Files Comparison completed unsuccessfully.')
print('\nError Details:')
print(e)
# Duration calculation END TIME
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
# Calling Continue function
fun_continue()
# Calling Comparison function
fun_comparison()
input()
Problem:
The code is working fine for small records which i did for testing but its not optimal for the huge records.
System is getting hang.
Consuming more memory as shown below in the screenshot:
This is a piece of code which needs to perform the follow functionality:
Dump all table names in a database
From each table search for a column with either Latitude or Longitude in
Store these co-ords as a json file
The code was tested and working on a single database. However once it was put into another piece of code which calls it with different databases it now is not entering line 49. However there is no error either so I am struggling to see what the issue is as I have not changed anything.
Code snippet line 48 is the bottom line -
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print (cursor)
for tablerow in cursor.fetchall():
I am running this in the /tmp/ dir due to an earlier error with sqlite not working outside the temp.
Any questions please ask them.
Thanks!!
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sqlite3
import os
import sys
filename = sys.argv[1]
def validateFile(filename):
filename, fileExt = os.path.splitext(filename)
print ("[Jconsole] Python: Filename being tested - " + filename)
if fileExt == '.db':
databases(filename)
elif fileExt == '.json':
jsons(fileExt)
elif fileExt == '':
blank()
else:
print ('Unsupported format')
print (fileExt)
def validate(number):
try:
number = float(number)
if -90 <= number <= 180:
return True
else:
return False
except ValueError:
pass
def databases(filename):
dbName = sys.argv[2]
print (dbName)
idCounter = 0
mainList = []
lat = 0
lon = 0
with sqlite3.connect(filename) as conn:
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print (cursor)
for tablerow in cursor.fetchall():
print ("YAY1")
table = tablerow[0]
cursor.execute('SELECT * FROM {t}'.format(t=table))
for row in cursor:
print(row)
print ("YAY")
tempList = []
for field in row.keys():
tempList.append(str(field))
tempList.append(str(row[field]))
for i in tempList:
if i in ('latitude', 'Latitude'):
index = tempList.index(i)
if validate(tempList[index + 1]):
idCounter += 1
tempList.append(idCounter)
(current_item, next_item) = \
(tempList[index], tempList[index + 1])
lat = next_item
if i in ('longitude', 'Longitude'):
index = tempList.index(i)
if validate(tempList[index + 1]):
(current_item, next_item) = \
(tempList[index], tempList[index + 1])
lon = next_item
result = '{ "id": ' + str(idCounter) \
+ ', "content": "' + dbName + '", "title": "' \
+ str(lat) + '", "className": "' + str(lon) \
+ '", "type": "box"},'
mainList.append(result)
file = open('appData.json', 'a')
for item in mainList:
file.write('%s\n' % item)
file.close()
# {
# ...."id": 1,
# ...."content": "<a class='thumbnail' href='./img/thumbs/thumb_IMG_20161102_151122.jpg'>IMG_20161102_151122.jpg</><span><img src='./img/thumbs/thumb_IMG_20161102_151122.jpg' border='0' /></span></a>",
# ...."title": "50.7700721944444",
# ...."className": "-0.8727045",
# ...."start": "2016-11-02 15:11:22",
# ...."type": "box"
# },
def jsons(filename):
print ('JSON')
def blank():
print ('blank')
validateFile(filename)
Fixed.
The issue was up here
filename, fileExt = os.path.splitext(filename)
The filename variable was being overwritten without the file extension so when SQLite searched it didn't find the file.
Strange no error appeared but it is fixed now by changing the filename var to filename1.
I'm programming a script that connects to an Oracle database and get the results into a log file. I want to get a output like this:
FEC_INCLUSION = 2005-08-31 11:43:48,DEBITO_PENDIENTE = None,CAN_CUOTAS = 1.75e-05,COD_CUENTA = 67084,INT_TOTAL = None,CAN_CUOTAS_ANTERIOR = None,COD_INVERSION = 1,FEC_MODIFICACION = 10/04/2012 09:45:22,SAL_TOT_ANTERIOR = None,CUOTA_COMISION = None,FEC_ULT_CALCULO = None,MODIFICADO_POR = CTAPELA,SAL_TOTAL = 0.15,COD_TIPSALDO = 1,MONTO_COMISION = None,COD_EMPRESA = 1,SAL_INFORMATIVO = None,COD_OBJETIVO = 5,SAL_RESERVA = None,INCLUIDO_POR = PVOROPE,APORTE_PROM = 0.0,COSTO_PROM = None,CREDITO_PENDIENTE = None,SAL_PROM = 0.0,
FEC_INCLUSION = 2005-08-31 11:43:49,DEBITO_PENDIENTE = None,CAN_CUOTAS = 0.0,COD_CUENTA = 67086,INT_TOTAL = None,CAN_CUOTAS_ANTERIOR = None,COD_INVERSION = 9,FEC_MODIFICACION = 25/02/2011 04:38:52,SAL_TOT_ANTERIOR = None,CUOTA_COMISION = None,FEC_ULT_CALCULO = None,MODIFICADO_POR = OPEJAMO,SAL_TOTAL = 0.0,COD_TIPSALDO = 1,MONTO_COMISION = None,COD_EMPRESA = 1,SAL_INFORMATIVO = None,COD_OBJETIVO = 5,SAL_RESERVA = None,INCLUIDO_POR = PVOROPE,APORTE_PROM = 0.0,COSTO_PROM = None,CREDITO_PENDIENTE = None,SAL_PROM = 0.0,
I created a dictionary with the query results:
def DictFactory(description,data):
column_names = [col[0] for col in description]
results = []
for row in data:
results.append(dict(zip(column_names,row)))
return results
Then I created this function which finally save the results into my log:
def WriteLog(log_file,header,data):
file_exist = os.path.isfile(log_file)
log = open(log_file,'a')
if not file_exist:
print "File does not exist, writing new log file"
open(log_file,'w').close()
mydata = DictFactory(header,data)
checkpoint_name = ReadCheckpointName()
string = ''
for m in mydata:
for k,v in m.items():
string = string + k + ' = ' + str(v) + ','
if k == checkpoint_name:
#print "KEY FOUND"
cur_checkpoint = v
cur_checkpoint = str(cur_checkpoint)
#print string
string = string + '\n'
print cur_checkpoint
log.write(string + '\n')
WriteCheckpoint(cur_checkpoint,checkpoint_file)
log.close()
This is the main function:
def GetInfo():
mypool = PoolToDB()
con = mypool.acquire()
cursor = con.cursor()
GetLastCheckpoint()
sql = ReadQuery()
#print sql
cursor.execute(sql)
data = cursor.fetchall()
WriteLog(log_file,cursor.description,data)
#WriteCsvLog(log_file,cursor.description,data)
cursor.close()
But I realized that it works if I use a query that fetch few records, however if I try to fetch many records my script never ends.
This is my output when I executed a query with 5000 records. As you can see it takes too long.
jballesteros#SplunkPorvenir FO_TIPSALDOS_X_CUENTA]$ python db_execution.py
Starting connection: 5636
GetLastCheckpoint function took 0.073 ms
GetLastCheckpoint function took 0.025 ms
ReadQuery function took 0.084 ms
File does not exist, writing new log file
DictFactory function took 23.050 ms
ReadCheckpointName function took 0.079 ms
WriteCheckpoint function took 0.204 ms
WriteLog function took 45112.133 ms
GetInfo function took 46193.033 ms
I'm pretty sure you know a much better way to do what I am trying to do.
This is the complete code:
#!/usr/bin/env python
# encoding: utf-8
import re
import sys
try:
import cx_Oracle
except:
print "Error: Oracle module required to run this plugin."
sys.exit(0)
import datetime
import re
import commands
import os
from optparse import OptionParser
import csv
import time
#################################
#### Database Variables ####
#################################
Config = {
"host" : "",
"user" : "",
"password" : "",
"instance" : "",
"port" : "",
}
Query = {
"sql" : "",
"checkpoint_datetype" : "",
"checkpoint_name" : "",
}
dir = '/home/jballesteros/PENS2000/FO_TIPSALDOS_X_CUENTA/'
connection_dir = '/home/jballesteros/PENS2000/Connection'
checkpoint_file = dir + 'checkpoint.conf'
log_file = '/var/log/Pens2000/FO_TIPSALDOS_X_CUENTA.csv'
internal_log = '/var/log/Pens2000/internal.log'
query = dir + 'query'
sys.path.append(os.path.abspath(connection_dir))
from db_connect_pool import *
def Timing(f):
def wrap(*args):
time1 = time.time()
ret = f(*args)
time2 = time.time()
print "%s function took %0.3f ms" % (f.func_name,(time2- time1)*1000.0)
return ret
return wrap
#Timing
def InternalLogWriter(message):
now = datetime.datetime.now()
log = open(internal_log, 'a')
log.write("%s ==> %s" % (now.strftime("%Y-%m-%d %H:%M:%S"),message))
log.close()
return
#Timing
def GetLastCheckpoint():
global cur_checkpoint
conf = open(checkpoint_file, 'r')
cur_checkpoint = conf.readline()
cur_checkpoint = cur_checkpoint.rstrip('\n')
cur_checkpoint = cur_checkpoint.rstrip('\r')
conf.close()
#Timing
def ReadQuery():
global cur_checkpoint
GetLastCheckpoint()
qr = open(query, 'r')
line = qr.readline()
line = line.rstrip('\n')
line = line.rstrip('\r')
Query["sql"], Query["checkpoint_datetype"],Query["checkpoint_name"] = line.split(";")
sql = Query["sql"]
checkpoint_datetype = Query["checkpoint_datetype"]
checkpoint_name = Query["checkpoint_name"]
if (checkpoint_datetype == "DATETIME"):
sql = sql + " AND " + checkpoint_name + " >= " + "TO_DATE('%s','YYYY-MM-DD HH24:MI:SS') ORDER BY %s" % (cur_checkpoint,checkpoint_name)
if (checkpoint_datetype == "NUMBER"):
sql = sql + " AND " + checkpoint_name + " > " + "%s ORDER BY %s" % (cur_checkpoint,checkpoint_name)
qr.close()
return str(sql)
#Timing
def ReadCheckpointName():
qr = open(query, 'r')
line = qr.readline()
line = line.rstrip('\n')
line = line.rstrip('\r')
Query["sql"], Query["checkpoint_datetype"],Query["checkpoint_name"] = line.split(";")
checkpoint_name = Query["checkpoint_name"]
return str(checkpoint_name)
#Timing
def LocateCheckPoint(description):
description
checkpoint_name = ReadCheckpointName()
#print checkpoint_name
#print description
startcounter = 0
finalcounter = 0
flag = 0
for d in description:
prog = re.compile(checkpoint_name)
result = prog.match(d[0])
startcounter = startcounter + 1
if result:
finalcounter = startcounter - 1
counterstr = str(finalcounter)
print "Checkpoint found in the array position number: " + counterstr
flag = 1
if (flag == 0):
print "Checkpoint did not found"
return finalcounter
#Timing
def DictFactory(description,data):
column_names = [col[0] for col in description]
results = []
for row in data:
results.append(dict(zip(column_names,row)))
return results
#Timing
def WriteCsvLog(log_file,header,data):
checkpoint_index = LocateCheckPoint(header)
file_exists = os.path.isfile(log_file)
with open(log_file,'ab') as csv_file:
headers = [i[0] for i in header]
csv_writer = csv.writer(csv_file,delimiter='|')
if not file_exists:
print "File does not exist, writing new CSV file"
csv_writer.writerow(headers) # Writing headers once
for d in data:
csv_writer.writerow(d)
cur_checkpoint = d[checkpoint_index]
cur_checkpoint = str(cur_checkpoint)
WriteCheckpoint(cur_checkpoint,checkpoint_file)
csv_file.close()
#Timing
def WriteLog(log_file,header,data):
file_exist = os.path.isfile(log_file)
log = open(log_file,'a')
if not file_exist:
print "File does not exist, writing new log file"
open(log_file,'w').close()
mydata = DictFactory(header,data)
checkpoint_name = ReadCheckpointName()
#prin #string = ''
for m in mydata:
for k,v in m.items():
string = string + k + ' = ' + str(v) + ','
if k == checkpoint_name:
#print "KEY FOUND"
cur_checkpoint = v
cur_checkpoint = str(cur_checkpoint)
#print string
string = string + '\n'
print cur_checkpoint
log.write(string + '\n')
WriteCheckpoint(cur_checkpoint,checkpoint_file)
log.close()
#Timing
def WriteCheckpoint(cur_checkpoint,conf_file):
conf = open(conf_file,'w')
conf.write(cur_checkpoint)
conf.close()
#Timing
def GetInfo():
mypool = PoolToDB()
con = mypool.acquire()
cursor = con.cursor()
GetLastCheckpoint()
sql = ReadQuery()
#print sql
cursor.execute(sql)
#data = cursor.fetchall()
#WriteLog(log_file,cursor.description,data)
#WriteCsvLog(log_file,cursor.description,data)
cursor.close()
def __main__():
parser = OptionParser()
parser.add_option("-c","--change- password",dest="pass_to_change",help="Change the password for database connection",metavar="1")
(options, args) = parser.parse_args()
if (options.pass_to_change):
UpdatePassword()
else:
GetInfo()
__main__()
This is a query sample:
SELECT COD_EMPRESA, COD_TIPSALDO, COD_INVERSION, COD_CUENTA, COD_OBJETIVO, CAN_CUOTAS, SAL_TOTAL, INT_TOTAL, SAL_RESERVA, APORTE_PROM, SAL_PROM, COSTO_PROM, SAL_TOT_ANTERIOR, FEC_ULT_CALCULO, INCLUIDO_POR, FEC_INCLUSION, MODIFICADO_POR, TO_CHAR(FEC_MODIFICACION,'DD/MM/YYYY HH24:MI:SS') AS FEC_MODIFICACION, CUOTA_COMISION, MONTO_COMISION, SAL_INFORMATIVO, CREDITO_PENDIENTE, DEBITO_PENDIENTE, CAN_CUOTAS_ANTERIOR FROM FO.FO_TIPSALDOS_X_CUENTA WHERE ROWNUM <=100000 AND FEC_INCLUSION >= TO_DATE('2005-08-31 11:43:49','YYYY-MM-DD HH24:MI:SS') ORDER BY FEC_INCLUSION
PS: I've really been searching in google and this forum about my question but I haven't found anything similar.
From http://fuzzytolerance.info/blog/2012/01/13/2012-01-14-updating-google-fusion-table-from-a-csv-file-using-python/ I have edited his code to import the necessary modules, however I get the following error "AttributeError: 'module' object has no attribute 'urlencode'". I run the code and I am prompted to enter my password, I enter my own google account password, and then the code gives me the error message, pehaps I need to define a password somewhere?
I wonder if anyone can please trouble shoot my code or advise me on how to avoid this error or even advise me of an EASIER way to import a CSV into a GOOGLE FUSION TABLE that I OWN
Here is my code
import csv
from decimal import *
import getpass
from fusiontables.authorization.clientlogin import ClientLogin
from fusiontables import ftclient
nameAgeNick = 'C:\\Users\\User\\Desktop\\NameAgeNickname.txt'
# check to see if something is an integer
def isInt(s):
try:
int(s)
return True
except ValueError:
return False
# check to see if something is a float
def isFloat(s):
try:
float(s)
return True
except ValueError:
return False
# open the CSV file
ifile = open(nameAgeNick, "rb")
reader = csv.reader(ifile)
# GFT table ID
tableID = "tableid"
# your username
username = "username"
# prompt for your password - you can hardcode it but this is more secure
password = getpass.getpass("Enter your password:")
# Get token and connect to GFT
token = ClientLogin().authorize(username, password)
ft_client = ftclient.ClientLoginFTClient(token)
# Loop through the CSV data and upload
# Assumptions for my data: if it's a float less than 0, it's a percentage
# Floats are being rounded to 1 significant digit
# Non-numbers are wrapped in a single quote for string-type in the updatate statement
# The first row is the column names and matches exactly the column names in Fustion tables
# The first column is the unique ID I'll use to select the record for updating in Fusion Tables
rownum = 0
setList = list()
nid = 0
for row in reader:
# Save header row.
if rownum == 0:
header = row
else:
colnum = 0
setList[:] = []
for col in row:
thedata = col
# This bit rounds numbers and turns numbers < 1 into percentages
if isFloat(thedata):
if isInt(thedata) is False:
if float(thedata) < 1:
thedata = float(thedata) * 100
thedata = round(float(thedata), 1)
else:
thedata = "'" + thedata + "'"
# make sql where clause for row
setList.append(header[colnum] + "=" + str(thedata))
nid = row[0]
colnum += 1
# get rowid and update the record
rowid = ft_client.query("select ROWID from " + tableID + " where ID = " + nid).split("\n")[1]
print( rowid)
print( ft_client.query("update " + tableID + " set " + ",".join(map(str, setList)) + " where rowid = '" + rowid + "'"))
rownum += 1
ifile.close()
And this is the module where the error occurs:
#!/usr/bin/python
#
# Copyright (C) 2010 Google Inc.
""" ClientLogin.
"""
__author__ = 'kbrisbin#google.com (Kathryn Brisbin)'
import urllib, urllib2
class ClientLogin():
def authorize(self, username, password):
auth_uri = 'https://www.google.com/accounts/ClientLogin'
authreq_data = urllib.urlencode({ //////HERE IS ERROR
'Email': username,
'Passwd': password,
'service': 'fusiontables',
'accountType': 'HOSTED_OR_GOOGLE'})
auth_req = urllib2.Request(auth_uri, data=authreq_data)
auth_resp = urllib2.urlopen(auth_req)
auth_resp_body = auth_resp.read()
auth_resp_dict = dict(
x.split('=') for x in auth_resp_body.split('\n') if x)
return auth_resp_dict['Auth']
Hello to all passionate programmers out there. I need your help with my code.
My Goal: To efficiently move data from Amazon S3 to Amazon Redshift.
Basically I am moving all CSV files on my S3 to Redshift using the below code. I parse through part of the file, build a table structure and then use the copy command to load data into redshift.
'''
Created on Feb 25, 2015
#author: Siddartha.Reddy
'''
import sys
from boto.s3 import connect_to_region
from boto.s3.connection import Location
import csv
import itertools
import psycopg2
''' ARGUMENTS TO PASS '''
AWS_KEY = sys.argv[1]
AWS_SECRET_KEY = sys.argv[2]
S3_DOWNLOAD_PATH = sys.argv[3]
REDSHIFT_SCHEMA = sys.argv[4]
TABLE_NAME = sys.argv[5]
UTILS = S3_DOWNLOAD_PATH.split('/')
class UTIL():
global UTILS
def bucket_name(self):
self.BUCKET_NAME = UTILS[0]
return self.BUCKET_NAME
def path(self):
self.PATH = ''
offset = 0
for value in UTILS:
if offset == 0:
offset += 1
else:
self.PATH = self.PATH + value + '/'
return self.PATH[:-1]
def GETDATAINMEMORY():
conn = connect_to_region(Location.USWest2,aws_access_key_id = AWS_KEY,
aws_secret_access_key = AWS_SECRET_KEY,
is_secure=False,host='s3-us-west-2.amazonaws.com'
)
ut = util()
BUCKET_NAME = ut.bucket_name()
PATH = ut.path()
filelist = conn.lookup(BUCKET_NAME)
''' Fecth part of the data from S3 '''
for path in filelist:
if PATH in path.name:
DATA = path.get_contents_as_string(headers={'Range': 'bytes=%s-%s' % (0,100000000)})
return DATA
def TRAVERSEDATA():
DATA = getdatainmemory()
CREATE_TABLE_QUERY = 'CREATE TABLE ' + REDSHIFT_SCHEMA + '.' + TABLE_NAME + '( '
JUNKED_OUT = DATA[3:]
PROCESSED_DATA = JUNKED_OUT.split('\n')
CSV_DATA = csv.reader(PROCESSED_DATA,delimiter=',')
COUNTER,STRING,NUMBER = 0,0,0
COLUMN_TYPE = []
''' GET COLUMN NAMES AND COUNT '''
for line in CSV_DATA:
NUMBER_OF_COLUMNS = len(line)
COLUMN_NAMES = line
break;
''' PROCESS COLUMN NAMES '''
a = 0
for REMOVESPACE in COLUMN_NAMES:
TEMPHOLDER = REMOVESPACE.split(' ')
temp1 = ''
for x in TEMPHOLDER:
temp1 = temp1 + x
COLUMN_NAMES[a] = temp1
a = a + 1
''' GET COLUMN DATA TYPES '''
# print(NUMBER_OF_COLUMNS,COLUMN_NAMES,COUNTER)
# print(NUMBER_OF_COLUMNS)
i,j,a= 0,500,0
while COUNTER < NUMBER_OF_COLUMNS:
for COLUMN in itertools.islice(CSV_DATA,i,j+1):
if COLUMN[COUNTER].isdigit():
NUMBER = NUMBER + 1
else:
STRING = STRING + 1
if NUMBER == 501:
COLUMN_TYPE.append('INTEGER')
# print('I CAME IN')
NUMBER = 0
else:
COLUMN_TYPE.append('VARCHAR(2500)')
STRING = 0
COUNTER = COUNTER + 1
# print(COUNTER)
COUNTER = 0
''' BUILD SCHEMA '''
while COUNTER < NUMBER_OF_COLUMNS:
if COUNTER == 0:
CREATE_TABLE_QUERY = CREATE_TABLE_QUERY + COLUMN_NAMES[COUNTER] + ' ' + COLUMN_TYPE[COUNTER] + ' NOT NULL,'
else:
CREATE_TABLE_QUERY = CREATE_TABLE_QUERY + COLUMN_NAMES[COUNTER] + ' ' + COLUMN_TYPE[COUNTER] + ' ,'
COUNTER += 1
CREATE_TABLE_QUERY = CREATE_TABLE_QUERY[:-2]+ ')'
return CREATE_TABLE_QUERY
def COPY_COMMAND():
S3_PATH = 's3://' + S3_DOWNLOAD_PATH
COPY_COMMAND = "COPY "+REDSHIFT_SCHEMA+"."+TABLE_NAME+" from '"+S3_PATH+"' credentials 'aws_access_key_id="+AWS_KEY+";aws_secret_access_key="+AWS_SECRET_KEY+"' REGION 'us-west-2' csv delimiter ',' ignoreheader as 1 TRIMBLANKS maxerror as 500"
return COPY_COMMAND
def S3TOREDSHIFT():
conn = psycopg2.connect("dbname='xxx' port='5439' user='xxx' host='xxxxxx' password='xxxxx'")
cursor = conn.cursor()
cursor.execute('DROP TABLE IF EXISTS '+ REDSHIFT_SCHEMA + "." + TABLE_NAME)
SCHEMA = TRAVERSEDATA()
print(SCHEMA)
cursor.execute(SCHEMA)
COPY = COPY_COMMAND()
print(COPY)
cursor.execute(COPY)
conn.commit()
S3TOREDSHIFT()
Current Challenges:
Challenges with creating the table structure :
Field lengths : Right now I am just hardcoding the VARCHAR fields to 2500. All my files are > 30gb and parsing through the whole file to calculate length of a field takes lot of processing time.
Determining if a column is null: I am simply hard coding the first column to NOT NULL using the COUNTER variable. ( All my files have ID as first column ). Would like to know if there is a better way of doing it.
Is there any data structure I can use? I am always interested in learning new ways to improve the performance, if you guys have any suggestions please feel free to comment.