APIErrorException: (BadArgument) 'recognitionModel' is incompatible - AZURE COGNITIVE FACE - python

I'm creating an attendance system using the Azure Cognitive Face API and storing the attendance in an Excel sheet. But an error occurs: "'recognitionModel' is incompatible." From the documentation I understand that there are two recognition models (recognition_01, recognition_02). Is it required to specify the type? If so, how do I do that in Python?
ERROR:
File "identify.py", line 58, in <module>
res = face_client.face.identify(faceIds, global_var.personGroupId)
File "C:\Python\Python36\lib\site-packages\azure\cognitiveservices\vision\face\operations\_face_operations.py", line 313, in identify
raise models.APIErrorException(self._deserialize, response)
azure.cognitiveservices.vision.face.models._models_py3.APIErrorException: (BadArgument) 'recognitionModel' is incompatible.
CODE:
from msrest.authentication import CognitiveServicesCredentials
from azure.cognitiveservices.vision.face import FaceClient
from azure.cognitiveservices.vision.face.models import TrainingStatusType, Person, SnapshotObjectType, OperationStatusType
import global_variables as global_var
import os, urllib
import sqlite3
from openpyxl import Workbook, load_workbook
from openpyxl.utils import get_column_letter, column_index_from_string
from openpyxl.cell import Cell
import time
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# get current date
currentDate = time.strftime("%d_%m_%y")

wb = load_workbook(filename="reports.xlsx")
sheet = wb['Cse16']

def getDateColumn():
    for i in range(1, len(list(sheet.rows)[0]) + 1):
        col = get_column_letter(i)
        if sheet['%s%s' % (col, '1')].value == currentDate:
            return col

Key = global_var.key
ENDPOINT = 'https://centralindia.api.cognitive.microsoft.com'
face_client = FaceClient(ENDPOINT, CognitiveServicesCredentials(Key))

connect = sqlite3.connect("Face-DataBase")
attend = [0 for i in range(60)]
currentDir = os.path.dirname(os.path.abspath(__file__))
directory = os.path.join(currentDir, 'Cropped_faces')

for filename in os.listdir(directory):
    if filename.endswith(".jpg"):
        print(filename)
        img_data = open(os.path.join(directory, filename), 'r+b')
        res = face_client.face.detect_with_stream(img_data)
        print("Res = {}".format(res))

        if len(res) < 1:
            print("No face detected.")
            continue

        faceIds = []
        for face in res:
            faceIds.append(face.face_id)

        res = face_client.face.identify(faceIds, global_var.personGroupId)  # error occurs on this line
        # print(filename)
        print("res = {}".format(res))

        for face in res:
            if not face['candidates']:
                print("Unknown")
            else:
                personId = face['candidates'][0]['personId']
                print("personid = {}".format(personId))
                # cmd = + personId
                cur = connect.execute("SELECT * FROM Students WHERE personID = (?)", (personId,))
                # print("cur = {}".format(cur))
                for row in cur:
                    print("aya")
                    print("row = {}".format(row))
                    attend[int(row[0])] += 1
                    print("---------- " + row[1] + " recognized ----------")
        time.sleep(6)

for row in range(2, len(list(sheet.columns)[0]) + 1):
    rn = sheet.cell(row=row, column=1).value
    if rn is not None:
        print("rn = {}".format(rn))
        rn = rn[-2:]
        if attend[int(rn)] != 0:
            col = getDateColumn()
            print("col = {}".format(col))
            sheet['%s%s' % (col, str(row))] = 0

wb.save(filename="reports.xlsx")

As mentioned in the services documentation portal (for example here for West Europe, but it is the same for all regions) for the Identify operation:
The 'recognitionModel' associated with the query faces' faceIds should
be the same as the 'recognitionModel' used by the target person group
or large person group.
So it looks like you have a mismatch here. You don't pass the recognitionModel to the Identify operation, but to the Detect operation that you run first.
And you must ensure that this value is the same as the one used for the personGroup in which you want to identify the person (see the PersonGroup Create operation, which takes a recognitionModel parameter).
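A rough sketch of keeping the two calls on the same model with the azure-cognitiveservices-vision-face SDK (recent versions expose a recognition_model keyword on both detect_with_stream and person_group.create; the choice of 'recognition_02', the group name, and img_path below are placeholders, not values from the question):

from azure.cognitiveservices.vision.face import FaceClient
from msrest.authentication import CognitiveServicesCredentials

face_client = FaceClient(ENDPOINT, CognitiveServicesCredentials(Key))

# The person group and the detect call must use the same recognition model.
# If the existing group was built with a different model, it has to be
# recreated (persons re-added) and retrained with the model you detect with.
face_client.person_group.create(
    person_group_id=global_var.personGroupId,
    name='attendance',                      # placeholder name
    recognition_model='recognition_02')     # placeholder model choice

# ... add persons / faces and train the group here ...

with open(img_path, 'rb') as img_data:      # img_path is a placeholder
    detected = face_client.face.detect_with_stream(
        img_data, recognition_model='recognition_02')

face_ids = [f.face_id for f in detected]
results = face_client.face.identify(face_ids, global_var.personGroupId)

The key point is simply that whichever model the group was built with is the one the detect call must use; if they differ, either pass the group's model to detect_with_stream or rebuild the group with the new one.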

Related

Matching Regex in Python from Excel file

I'm using regex to match the following Excel file and I'm struggling with how to separate each row into
the timestamp [0:00:48],
the ID 20052A,
and the content content (more content).
This is the Excel row (one of many, so the ID, the timestamp, and the content can all vary from row to row):
[0:00:48] 20052A: content (more content)
I get the error
AttributeError: 'NoneType' object has no attribute 'group'
when matching my ID, where I have
re.search(r"^(.+:)(.+)|(r(\w+)?\s*\[(.*)\]\s*(\w+))", c)
Keep in mind that from time to time the ID looks something like this
[0:00:33] 30091aA: (content) 
My whole script is below (the database connection details are blanked out):
import os
import re
import pymysql
pymysql.install_as_MySQLdb()
import pandas as pd
import sqlalchemy

def insert_or_update(engine, pd_table, table_name):
    inserts = 0
    updates = 0
    for i in range(len(pd_table)):
        vals_with_quotes = ["'" + str(x) + "'" for x in pd_table.loc[i, :].values]
        # print(vals_with_quotes)
        update_pairs = [str(c) + " = '" + str(v) + "'" for c, v in zip(pd_table.columns, pd_table.loc[i, :])]
        query = f"INSERT INTO {table_name} ({', '.join(list(pd_table.columns.values))}) " \
                f"VALUES ({', '.join(vals_with_quotes)}) " \
                f"ON DUPLICATE KEY UPDATE {', '.join(update_pairs)}"
        print(query)
        result = engine.execute(query)
        if result.lastrowid == 0:
            updates += 1
        else:
            inserts += 1
    print(f"Inserted {inserts} rows and updated {updates} rows.")

schema = '---'
alchemy_connect = "---"
engine = sqlalchemy.create_engine(alchemy_connect)  # connect to server
engine.execute(f"USE {schema}")  # select new db
# engine.execute("SET NAMES UTF8MB4;")
query = "SELECT * FROM .... where ...=..."
pm = pd.read_sql(query, engine)

rootpath = "path/"
for root, dirs, files in os.walk(rootpath):
    for file in files:
        print(root, dirs, files, file)
        d = pd.read_excel(root + file, header=None)
        d.drop(columns=[0], inplace=True)
        d.rename(columns={1: "content"}, inplace=True)
        participants = []
        for ix, row in d.iterrows():
            c = row["content"]
            match = re.search(r"^(.+:)(.+)|(r(\w+)?\s*\[(.*)\]\s*(\w+))", c)
            prefix = match.group(1)
            only_content = match.group(2)
            try:
                timestamp = re.search(r"\[(\d{1,2}:\d{1,2}:\d{1,2})\]", prefix).group(1)
            except:
                timestamp = "-99"
            # print(timestamp)
            if re.search(r"\s(Versuchsleiter|ersuchsleiter|Versuchsleit|Versuch):", prefix):
                id_code = "Versuchsleiter"
            else:
                starting_digits = re.search(r"^(\d+)", prefix)
                id_code = re.search(r"(\d{2,4}.{1,3}):", prefix).group(1)
                if hasattr(starting_digits, 'group'):
                    id_code = starting_digits.group(1) + id_code
            # get pid
            participant = pm.loc[pm["id_code"] == id_code, "pid"]
            try:
                pid = participant.values[0]
            except:
                pid = "Versuchsleiter"
            # print(ix, pid, id_code, only_content, timestamp)
            if pid and pid not in participants and pid != "Versuchsleiter":
                participants.append(pid)
            d.loc[ix, "pid"] = pid
            d.loc[ix, "timestamp"] = timestamp
            d.loc[ix, "content"] = only_content.strip()
            d.loc[ix, "is_participant"] = 0 if pid == "Versuchsleiter" else 1
        d = d[["pid", "is_participant", "content", "timestamp"]]
        d.loc[(d['pid'] == "Versuchsleiter"), "pid"] = participants[0]
        d.loc[(d['pid'] == None), "pid"] = participants[0]
        insert_or_update(engine, d, "table of sql")
I need "Versuchsleiter" since some of the ID's are "Versuchsleiter"
Thank you!
You should take advantage of capturing groups.
All the initial regex matching (after c = row["content"] and before # get pid) can be done with
match = re.search(r"^\[(\d{1,2}:\d{1,2}:\d{1,2})]\s+(\w+):\s*(.*)", c)
if match:
    timestamp = match.group(1)
    id_code = match.group(2)
    only_content = match.group(3)
    if re.search(r"(?:Versuch(?:sleit(?:er)?)?|ersuchsleiter)", id_code):
        id_code = "Versuchsleiter"
Your timestamp will be 0:00:33, only_content will hold (content) and id_code will contain 30091aA.
See the regex demo
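To make the behaviour easy to verify, here is the same pattern run as a stand-alone snippet against the two sample rows from the question:

import re

pattern = r"^\[(\d{1,2}:\d{1,2}:\d{1,2})]\s+(\w+):\s*(.*)"
samples = [
    "[0:00:48] 20052A: content (more content)",
    "[0:00:33] 30091aA: (content)",
]
for c in samples:
    match = re.search(pattern, c)
    if match:
        timestamp, id_code, only_content = match.group(1), match.group(2), match.group(3)
        print(timestamp, id_code, only_content)
# Expected output:
# 0:00:48 20052A content (more content)
# 0:00:33 30091aA (content)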
Thank you for your help, but this gives me the following error:
Traceback (most recent call last):
File "C:/Users/.../PycharmProjects/.../.../....py", line 80, in <module>
insert_or_update(engine, d, "sql table")
TypeError: not enough arguments for format string
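For what it's worth, one likely cause of that follow-up error (an assumption, since the traceback points into insert_or_update rather than the regex): a literal % character in one of the inserted values. When the raw INSERT string reaches the MySQL driver it is treated as a format string, so a stray % looks like an incomplete placeholder. A minimal guard that keeps the current string-building approach might look like this:

# Escape literal '%' (and single quotes) before the values are pasted into the
# raw SQL string; '%%' is how a literal percent sign survives driver-side formatting.
def sql_literal(x):
    return "'" + str(x).replace("'", "''").replace("%", "%%") + "'"

vals_with_quotes = [sql_literal(x) for x in pd_table.loc[i, :].values]
update_pairs = [str(c) + " = " + sql_literal(v) for c, v in zip(pd_table.columns, pd_table.loc[i, :])]

A more robust alternative is to pass the values as bound parameters (for example via sqlalchemy.text with named placeholders) instead of concatenating them into the SQL string.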

Files comparison optimization

I have to compare files for a huge (10-20 million) set of records.
Requirement explanation:
For the file comparison, there will be two files to compare in order
to find the differing records.
The file types are: .txt, .csv, .xlsx, .mdb or .accdb.
File 1 can be any of the types mentioned in the first point.
File 2 can be any of the types mentioned in the first point.
The delimiter for File 1 or File 2 is unknown; it may be any of ~ ^ ; |.
Each file has more than 70 columns.
File 1 is older than File 2 in terms of records. File 1 may have 10 million and File 2 may have 10.2 million records.
I need to create File 3, which consists of the differing records (for example the 0.2 million records from point 6) between File 1 and File 2, with the column header.
My try: I have used a set to collect the data from both files (File 1 and File 2) and done the comparison
using for loops and if conditions.
import pyodbc
import os.path
import string
import re
import sys
import time
from datetime import datetime

# Function for "Do you want to continue?"
def fun_continue():
    # If you want to continue
    yesno = raw_input('\nDo you want to continue(Y/N)?')
    if yesno == 'Y':
        fun_comparison()
    else:
        sys.exit()

def fun_comparison():
    # Getting input values
    file1 = raw_input('Enter the file1 name with path:')
    file_extension_old = os.path.splitext(file1)[1]

    # Condition check for the file extension; if it's an Access DB then ask for the table name
    if (file_extension_old == ".accdb") or (file_extension_old == ".mdb"):
        table_name_old = raw_input('Enter table name:')

    file2 = raw_input('Enter the latest file name:')
    file_extension_latest = os.path.splitext(file2)[1]

    # Condition check for the file extension; if it's an Access DB then ask for the table name
    if (file_extension_latest == ".accdb") or (file_extension_latest == ".mdb"):
        table_name_latest = raw_input('Enter table name:')

    file3 = raw_input('Give the file name to store the comparison result:')
    print('Files comparison is running! Please wait...')

    # Duration calculation START TIME
    start_time = datetime.now()

    # Code for file comparison
    try:
        # Condition check for the ACCESS FILE -- FILE 1
        if (file_extension_old == ".accdb") or (file_extension_old == ".mdb"):
            conn_string_old = r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=' + file1 + ';'
            con_old = pyodbc.connect(conn_string_old)
            cur_old = con_old.cursor()

            # Getting column list
            res_old = cur_old.execute('SELECT * FROM ' + table_name_old + ' WHERE 1=0')
            column_list = [tuple(map(str, record_new))[0] for record_new in res_old.description]
            column_list = ';'.join(column_list)

            # For getting data
            SQLQuery_old = 'SELECT * FROM ' + table_name_old + ';'
            rows_old = cur_old.execute(SQLQuery_old).fetchall()
            records_old = [tuple(map(str, record_old)) for record_old in rows_old]
            records_old = [";".join(t) + "\n" for t in records_old]
            records_old = set(records_old)
            records_old = map(str.strip, records_old)
            # print records_old
        else:
            with open(file1) as a:
                column_list = a.readline()
                column_list = re.sub(r"[;,|^~]", ";", column_list)
                a = set(a)
                sete = map(str.strip, a)
                setf = [re.sub(r"[;,|^~]", ";", s) for s in sete]
                records_old = [";".join(map(str.strip, i.split(";"))) for i in setf]

        # Condition check for the ACCESS FILE -- FILE 2
        if (file_extension_latest == ".accdb") or (file_extension_latest == ".mdb"):
            conn_string_new = r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=' + file2 + ';'
            con_new = pyodbc.connect(conn_string_new)
            cur_new = con_new.cursor()

            # Getting column list
            res_new = cur_new.execute('SELECT * FROM ' + table_name_latest + ' WHERE 1=0')
            column_list = [tuple(map(str, record_new))[0] for record_new in res_new.description]
            column_list = ';'.join(column_list)

            SQLQuery_new = 'SELECT * FROM ' + table_name_latest + ';'
            rows_new = cur_new.execute(SQLQuery_new).fetchall()
            records_new = [tuple(map(str, record_new)) for record_new in rows_new]
            records_new = [";".join(t) + "\n" for t in records_new]
            records_new = set(records_new)
            records_new = map(str.strip, records_new)
            # print records_new
        else:
            with open(file2) as b:
                column_list = b.readline()
                column_list = re.sub(r"[;,|^~]", ";", column_list)
                b = set(b)
                sete = map(str.strip, b)
                setf = [re.sub(r"[;,|^~]", ";", s) for s in sete]
                records_new = [";".join(map(str.strip, i.split(";"))) for i in setf]

        column_list = column_list.strip()
        column_list = column_list.replace('; ', ';').strip(' ')

        with open(file3, 'w') as result:
            result.write(column_list + '\n')
            for line in records_new:
                if line not in records_old:
                    result.write(line + '\n')
    except Exception as e:
        print('\n\nError! Files Comparison completed unsuccessfully.')
        print('\nError Details:')
        print(e)

    # Duration calculation END TIME
    end_time = datetime.now()
    print('Duration: {}'.format(end_time - start_time))

    # Calling continue function
    fun_continue()

# Calling comparison function
fun_comparison()
input()
Problem:
The code works fine for the small record sets I used for testing, but it is not optimal for huge record sets.
The system hangs.
It consumes a lot of memory, as shown in the screenshot below:
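A sketch of one way to cut the memory pressure for the plain-text/CSV branch (the Access branches would need the same idea applied to their cursors): hold only the older file in memory, as a set of hashes of normalized rows, and stream the newer file line by line, writing differences as they are found instead of materializing both files. Note that hash() can in principle collide; hashlib digests, or storing the full normalized rows, trade memory back for exactness.

import re

DELIMS = r"[;,|^~]"

def normalize(line):
    # Collapse whichever delimiter the file uses to ';' and strip stray spaces.
    return ";".join(part.strip() for part in re.sub(DELIMS, ";", line.strip()).split(";"))

def diff_files(file1, file2, file3):
    # Hold only File 1 (the older file) in memory, as hashes of its rows.
    with open(file1) as old:
        header = normalize(old.readline())
        seen = {hash(normalize(line)) for line in old}
    # Stream File 2 and write every row whose hash was not seen in File 1.
    with open(file2) as new, open(file3, "w") as out:
        out.write(header + "\n")
        new.readline()  # skip File 2's header row
        for line in new:
            row = normalize(line)
            if hash(row) not in seen:
                out.write(row + "\n")

Set membership on hashes is O(1) per row, so the whole comparison stays a single pass over each file; the original code's "line not in records_old" against a list is what makes the run time blow up at tens of millions of rows.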

IndexError: list index out of range google-api linkedin

Hello guys,
The code can be seen below (it doesn't belong to me), and I'm trying to run it without the environment path, which is OK because I used to just run it with python.exe (Open with). The code is used to create an Excel file with contact details and some other information from the data.
But now when I try to run the command, I get the following error for some reason. Any help would be appreciated since I'm new to APIs.
Thank you.
The command I run in cmd, which contains the two inputs, after opening the file location (cd):
lee.py "<job_role + 'email me at' + company>" 10
The Error:
Traceback (most recent call last):
File "C:\Users\z003wdaf\Desktop\linkedin\lee.py", line 15, in <module>
search_term = sys.argv[1]
IndexError: list index out of range
The code:
from googleapiclient.discovery import build
import datetime as dt
import sys
from xlwt import Workbook
import re

if __name__ == '__main__':
    now_sfx = dt.datetime.now().strftime('%Y%m%d_%H%M%S')
    output_dir = './output/'
    output_fname = output_dir + 'srch_res_' + now_sfx + '.xls'
    search_term = sys.argv[1]
    num_requests = int(sys.argv[2])

    my_api_key = "My API Key"
    my_cse_id = "011658049436509675749:gkuaxghjf5u"
    service = build("customsearch", "v1", developerKey=my_api_key)

    wb = Workbook()
    sheet1 = wb.add_sheet(search_term[0:15])
    wb.save(output_fname)

    sheet1.write(0, 0, 'Name')
    sheet1.write(0, 1, 'Profile Link')
    sheet1.write(0, 2, 'Snippet')
    sheet1.write(0, 3, 'Present Organisation')
    sheet1.write(0, 4, 'Location')
    sheet1.write(0, 5, 'Role')
    sheet1.write(0, 6, 'Email')

    sheet1.col(0).width = 256 * 20
    sheet1.col(1).width = 256 * 50
    sheet1.col(2).width = 256 * 100
    sheet1.col(3).width = 256 * 20
    sheet1.col(4).width = 256 * 20
    sheet1.col(5).width = 256 * 50
    sheet1.col(6).width = 256 * 50
    wb.save(output_fname)

    row = 1

    def google_search(search_term, cse_id, start_val, **kwargs):
        res = service.cse().list(q=search_term, cx=cse_id, start=start_val, **kwargs).execute()
        return res

    for i in range(0, num_requests):
        start_val = 1 + (i * 10)
        results = google_search(search_term,
                                my_cse_id,
                                start_val,
                                num=10
                                )
        for profile in range(0, 10):
            snippet = results['items'][profile]['snippet']
            myList = [item for item in snippet.split('\n')]
            newSnippet = ' '.join(myList)
            contain = re.search(r'[\w\.-]+@[\w\.-]+', newSnippet)
            if contain is not None:
                title = results['items'][profile]['title']
                link = results['items'][profile]['link']
                org = "-NA-"
                location = "-NA-"
                role = "-NA-"
                if 'person' in results['items'][profile]['pagemap']:
                    if 'org' in results['items'][profile]['pagemap']['person'][0]:
                        org = results['items'][profile]['pagemap']['person'][0]['org']
                    if 'location' in results['items'][profile]['pagemap']['person'][0]:
                        location = results['items'][profile]['pagemap']['person'][0]['location']
                    if 'role' in results['items'][profile]['pagemap']['person'][0]:
                        role = results['items'][profile]['pagemap']['person'][0]['role']
                print(title[:-23])
                sheet1.write(row, 0, title[:-23])
                sheet1.write(row, 1, link)
                sheet1.write(row, 2, newSnippet)
                sheet1.write(row, 3, org)
                sheet1.write(row, 4, location)
                sheet1.write(row, 5, role)
                sheet1.write(row, 6, contain[0])
                print('Wrote {} search result(s)...'.format(row))
                wb.save(output_fname)
                row = row + 1
    print('Output file "{}" written.'.format(output_fname))
Your code expects two inputs on the command line when you run the file: the first input is your search term and the second is the number of requests.
Instead of running just python.exe C:\Users\z003wdaf\Desktop\linkedin\lee.py in your terminal, add some extra arguments:
python.exe C:\Users\z003wdaf\Desktop\linkedin\lee.py random_thing 5
where you replace random_thing and 5 with whatever you want.
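If the script also needs to survive being launched without arguments (for example via "Open with" python.exe), a small guard with fallback values avoids the IndexError; the defaults below are purely illustrative:

import sys

# Use command-line arguments when present, otherwise fall back to
# illustrative defaults so the script can still run when double-clicked.
if len(sys.argv) >= 3:
    search_term = sys.argv[1]
    num_requests = int(sys.argv[2])
else:
    search_term = "engineer 'email me at'"  # placeholder query
    num_requests = 10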

Import a CSV to Google Fusion Table with python

From http://fuzzytolerance.info/blog/2012/01/13/2012-01-14-updating-google-fusion-table-from-a-csv-file-using-python/ I have edited his code to import the necessary modules; however, I get the following error: "AttributeError: 'module' object has no attribute 'urlencode'". I run the code and am prompted to enter my password; I enter my own Google account password, and then the code gives me the error message. Perhaps I need to define a password somewhere?
I wonder if anyone can please troubleshoot my code, advise me on how to avoid this error, or even suggest an easier way to import a CSV into a Google Fusion Table that I own.
Here is my code
import csv
from decimal import *
import getpass
from fusiontables.authorization.clientlogin import ClientLogin
from fusiontables import ftclient

nameAgeNick = 'C:\\Users\\User\\Desktop\\NameAgeNickname.txt'

# check to see if something is an integer
def isInt(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# check to see if something is a float
def isFloat(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

# open the CSV file
ifile = open(nameAgeNick, "rb")
reader = csv.reader(ifile)

# GFT table ID
tableID = "tableid"
# your username
username = "username"
# prompt for your password - you can hardcode it but this is more secure
password = getpass.getpass("Enter your password:")

# Get token and connect to GFT
token = ClientLogin().authorize(username, password)
ft_client = ftclient.ClientLoginFTClient(token)

# Loop through the CSV data and upload
# Assumptions for my data: if it's a float less than 0, it's a percentage
# Floats are being rounded to 1 significant digit
# Non-numbers are wrapped in a single quote for string-type in the update statement
# The first row is the column names and matches exactly the column names in Fusion Tables
# The first column is the unique ID I'll use to select the record for updating in Fusion Tables
rownum = 0
setList = list()
nid = 0
for row in reader:
    # Save header row.
    if rownum == 0:
        header = row
    else:
        colnum = 0
        setList[:] = []
        for col in row:
            thedata = col
            # This bit rounds numbers and turns numbers < 1 into percentages
            if isFloat(thedata):
                if isInt(thedata) is False:
                    if float(thedata) < 1:
                        thedata = float(thedata) * 100
                    thedata = round(float(thedata), 1)
            else:
                thedata = "'" + thedata + "'"
            # make sql where clause for row
            setList.append(header[colnum] + "=" + str(thedata))
            nid = row[0]
            colnum += 1
        # get rowid and update the record
        rowid = ft_client.query("select ROWID from " + tableID + " where ID = " + nid).split("\n")[1]
        print(rowid)
        print(ft_client.query("update " + tableID + " set " + ",".join(map(str, setList)) + " where rowid = '" + rowid + "'"))
    rownum += 1
ifile.close()
And this is the module where the error occurs:
#!/usr/bin/python
#
# Copyright (C) 2010 Google Inc.

""" ClientLogin.
"""

__author__ = 'kbrisbin@google.com (Kathryn Brisbin)'

import urllib, urllib2

class ClientLogin():
    def authorize(self, username, password):
        auth_uri = 'https://www.google.com/accounts/ClientLogin'
        authreq_data = urllib.urlencode({  # <-- HERE IS THE ERROR
            'Email': username,
            'Passwd': password,
            'service': 'fusiontables',
            'accountType': 'HOSTED_OR_GOOGLE'})
        auth_req = urllib2.Request(auth_uri, data=authreq_data)
        auth_resp = urllib2.urlopen(auth_req)
        auth_resp_body = auth_resp.read()
        auth_resp_dict = dict(
            x.split('=') for x in auth_resp_body.split('\n') if x)
        return auth_resp_dict['Auth']
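The AttributeError itself is a Python version issue rather than a password problem: urllib.urlencode and urllib2 only exist in Python 2; in Python 3, urlencode lives in urllib.parse and requests go through urllib.request. If the script is being run under Python 3, the request-building part of this module would need to look roughly like the sketch below (running the script with Python 2 instead would keep the original module working as-is):

from urllib.parse import urlencode
from urllib.request import Request, urlopen

# Python 3 equivalent of the urllib/urllib2 calls above; the POST body must be bytes.
authreq_data = urlencode({
    'Email': username,
    'Passwd': password,
    'service': 'fusiontables',
    'accountType': 'HOSTED_OR_GOOGLE'}).encode('utf-8')
auth_req = Request(auth_uri, data=authreq_data)
auth_resp_body = urlopen(auth_req).read().decode('utf-8')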

Need help to improve performance of my python code

Hello to all passionate programmers out there. I need your help with my code.
My goal: to efficiently move data from Amazon S3 to Amazon Redshift.
Basically I am moving all CSV files in my S3 bucket to Redshift using the code below. I parse part of each file, build a table structure, and then use the COPY command to load the data into Redshift.
'''
Created on Feb 25, 2015

@author: Siddartha.Reddy
'''
import sys
from boto.s3 import connect_to_region
from boto.s3.connection import Location
import csv
import itertools
import psycopg2

''' ARGUMENTS TO PASS '''
AWS_KEY = sys.argv[1]
AWS_SECRET_KEY = sys.argv[2]
S3_DOWNLOAD_PATH = sys.argv[3]
REDSHIFT_SCHEMA = sys.argv[4]
TABLE_NAME = sys.argv[5]

UTILS = S3_DOWNLOAD_PATH.split('/')

class UTIL():
    global UTILS

    def bucket_name(self):
        self.BUCKET_NAME = UTILS[0]
        return self.BUCKET_NAME

    def path(self):
        self.PATH = ''
        offset = 0
        for value in UTILS:
            if offset == 0:
                offset += 1
            else:
                self.PATH = self.PATH + value + '/'
        return self.PATH[:-1]

def GETDATAINMEMORY():
    conn = connect_to_region(Location.USWest2, aws_access_key_id=AWS_KEY,
                             aws_secret_access_key=AWS_SECRET_KEY,
                             is_secure=False, host='s3-us-west-2.amazonaws.com')
    ut = UTIL()
    BUCKET_NAME = ut.bucket_name()
    PATH = ut.path()
    filelist = conn.lookup(BUCKET_NAME)
    ''' Fetch part of the data from S3 '''
    for path in filelist:
        if PATH in path.name:
            DATA = path.get_contents_as_string(headers={'Range': 'bytes=%s-%s' % (0, 100000000)})
            return DATA

def TRAVERSEDATA():
    DATA = GETDATAINMEMORY()
    CREATE_TABLE_QUERY = 'CREATE TABLE ' + REDSHIFT_SCHEMA + '.' + TABLE_NAME + '( '
    JUNKED_OUT = DATA[3:]
    PROCESSED_DATA = JUNKED_OUT.split('\n')
    CSV_DATA = csv.reader(PROCESSED_DATA, delimiter=',')
    COUNTER, STRING, NUMBER = 0, 0, 0
    COLUMN_TYPE = []

    ''' GET COLUMN NAMES AND COUNT '''
    for line in CSV_DATA:
        NUMBER_OF_COLUMNS = len(line)
        COLUMN_NAMES = line
        break

    ''' PROCESS COLUMN NAMES '''
    a = 0
    for REMOVESPACE in COLUMN_NAMES:
        TEMPHOLDER = REMOVESPACE.split(' ')
        temp1 = ''
        for x in TEMPHOLDER:
            temp1 = temp1 + x
        COLUMN_NAMES[a] = temp1
        a = a + 1

    ''' GET COLUMN DATA TYPES '''
    # print(NUMBER_OF_COLUMNS,COLUMN_NAMES,COUNTER)
    # print(NUMBER_OF_COLUMNS)
    i, j, a = 0, 500, 0
    while COUNTER < NUMBER_OF_COLUMNS:
        for COLUMN in itertools.islice(CSV_DATA, i, j + 1):
            if COLUMN[COUNTER].isdigit():
                NUMBER = NUMBER + 1
            else:
                STRING = STRING + 1
        if NUMBER == 501:
            COLUMN_TYPE.append('INTEGER')
            # print('I CAME IN')
            NUMBER = 0
        else:
            COLUMN_TYPE.append('VARCHAR(2500)')
            STRING = 0
        COUNTER = COUNTER + 1
        # print(COUNTER)
    COUNTER = 0

    ''' BUILD SCHEMA '''
    while COUNTER < NUMBER_OF_COLUMNS:
        if COUNTER == 0:
            CREATE_TABLE_QUERY = CREATE_TABLE_QUERY + COLUMN_NAMES[COUNTER] + ' ' + COLUMN_TYPE[COUNTER] + ' NOT NULL,'
        else:
            CREATE_TABLE_QUERY = CREATE_TABLE_QUERY + COLUMN_NAMES[COUNTER] + ' ' + COLUMN_TYPE[COUNTER] + ' ,'
        COUNTER += 1
    CREATE_TABLE_QUERY = CREATE_TABLE_QUERY[:-2] + ')'
    return CREATE_TABLE_QUERY

def COPY_COMMAND():
    S3_PATH = 's3://' + S3_DOWNLOAD_PATH
    COPY_COMMAND = "COPY " + REDSHIFT_SCHEMA + "." + TABLE_NAME + " from '" + S3_PATH + "' credentials 'aws_access_key_id=" + AWS_KEY + ";aws_secret_access_key=" + AWS_SECRET_KEY + "' REGION 'us-west-2' csv delimiter ',' ignoreheader as 1 TRIMBLANKS maxerror as 500"
    return COPY_COMMAND

def S3TOREDSHIFT():
    conn = psycopg2.connect("dbname='xxx' port='5439' user='xxx' host='xxxxxx' password='xxxxx'")
    cursor = conn.cursor()
    cursor.execute('DROP TABLE IF EXISTS ' + REDSHIFT_SCHEMA + "." + TABLE_NAME)
    SCHEMA = TRAVERSEDATA()
    print(SCHEMA)
    cursor.execute(SCHEMA)
    COPY = COPY_COMMAND()
    print(COPY)
    cursor.execute(COPY)
    conn.commit()

S3TOREDSHIFT()
Current challenges:
Challenges with creating the table structure:
Field lengths: right now I am just hardcoding the VARCHAR fields to 2500. All my files are > 30 GB, and parsing through the whole file to calculate the length of each field takes a lot of processing time.
Determining if a column is nullable: I am simply hardcoding the first column to NOT NULL using the COUNTER variable (all my files have ID as the first column). I would like to know if there is a better way of doing it.
Is there any data structure I can use? I am always interested in learning new ways to improve performance; if you have any suggestions, please feel free to comment.
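Not a full answer, but one sketch of attacking the first two challenges with the sample the script already downloads: scan a bounded number of rows once and collect, per column, both a type guess and the longest value seen, instead of re-reading the sample with islice once per column. The function name, the sample size, and the width margin below are illustrative, not part of the original script:

import csv
import itertools

def infer_schema(lines, sample_size=500, varchar_margin=2):
    # Guess column types and VARCHAR widths from a bounded sample of rows.
    # One pass over the sample covers every column at once.
    reader = csv.reader(lines, delimiter=',')
    columns = [name.replace(' ', '') for name in next(reader)]
    is_int = [True] * len(columns)
    max_len = [1] * len(columns)

    for row in itertools.islice(reader, sample_size):
        for i, value in enumerate(row[:len(columns)]):
            if not value.isdigit():
                is_int[i] = False
            max_len[i] = max(max_len[i], len(value))

    types = ['INTEGER' if flag else 'VARCHAR(%d)' % (length * varchar_margin)
             for flag, length in zip(is_int, max_len)]
    return columns, types

Something like columns, types = infer_schema(PROCESSED_DATA) could then feed the CREATE TABLE string the script already builds. The widths are only estimates from the sample, so keeping a margin (or loading with COPY options such as TRUNCATECOLUMNS) is still advisable for rows outside the sampled range.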
