Hello to all passionate programmers out there. I need your help with my code.
My Goal: To efficiently move data from Amazon S3 to Amazon Redshift.
Basically I am moving all CSV files on my S3 to Redshift using the below code. I parse through part of the file, build a table structure and then use the copy command to load data into redshift.
'''
Created on Feb 25, 2015
#author: Siddartha.Reddy
'''
import sys
from boto.s3 import connect_to_region
from boto.s3.connection import Location
import csv
import itertools
import psycopg2
''' ARGUMENTS TO PASS '''
AWS_KEY = sys.argv[1]
AWS_SECRET_KEY = sys.argv[2]
S3_DOWNLOAD_PATH = sys.argv[3]
REDSHIFT_SCHEMA = sys.argv[4]
TABLE_NAME = sys.argv[5]
UTILS = S3_DOWNLOAD_PATH.split('/')
class UTIL():
global UTILS
def bucket_name(self):
self.BUCKET_NAME = UTILS[0]
return self.BUCKET_NAME
def path(self):
self.PATH = ''
offset = 0
for value in UTILS:
if offset == 0:
offset += 1
else:
self.PATH = self.PATH + value + '/'
return self.PATH[:-1]
def GETDATAINMEMORY():
conn = connect_to_region(Location.USWest2,aws_access_key_id = AWS_KEY,
aws_secret_access_key = AWS_SECRET_KEY,
is_secure=False,host='s3-us-west-2.amazonaws.com'
)
ut = util()
BUCKET_NAME = ut.bucket_name()
PATH = ut.path()
filelist = conn.lookup(BUCKET_NAME)
''' Fecth part of the data from S3 '''
for path in filelist:
if PATH in path.name:
DATA = path.get_contents_as_string(headers={'Range': 'bytes=%s-%s' % (0,100000000)})
return DATA
def TRAVERSEDATA():
DATA = getdatainmemory()
CREATE_TABLE_QUERY = 'CREATE TABLE ' + REDSHIFT_SCHEMA + '.' + TABLE_NAME + '( '
JUNKED_OUT = DATA[3:]
PROCESSED_DATA = JUNKED_OUT.split('\n')
CSV_DATA = csv.reader(PROCESSED_DATA,delimiter=',')
COUNTER,STRING,NUMBER = 0,0,0
COLUMN_TYPE = []
''' GET COLUMN NAMES AND COUNT '''
for line in CSV_DATA:
NUMBER_OF_COLUMNS = len(line)
COLUMN_NAMES = line
break;
''' PROCESS COLUMN NAMES '''
a = 0
for REMOVESPACE in COLUMN_NAMES:
TEMPHOLDER = REMOVESPACE.split(' ')
temp1 = ''
for x in TEMPHOLDER:
temp1 = temp1 + x
COLUMN_NAMES[a] = temp1
a = a + 1
''' GET COLUMN DATA TYPES '''
# print(NUMBER_OF_COLUMNS,COLUMN_NAMES,COUNTER)
# print(NUMBER_OF_COLUMNS)
i,j,a= 0,500,0
while COUNTER < NUMBER_OF_COLUMNS:
for COLUMN in itertools.islice(CSV_DATA,i,j+1):
if COLUMN[COUNTER].isdigit():
NUMBER = NUMBER + 1
else:
STRING = STRING + 1
if NUMBER == 501:
COLUMN_TYPE.append('INTEGER')
# print('I CAME IN')
NUMBER = 0
else:
COLUMN_TYPE.append('VARCHAR(2500)')
STRING = 0
COUNTER = COUNTER + 1
# print(COUNTER)
COUNTER = 0
''' BUILD SCHEMA '''
while COUNTER < NUMBER_OF_COLUMNS:
if COUNTER == 0:
CREATE_TABLE_QUERY = CREATE_TABLE_QUERY + COLUMN_NAMES[COUNTER] + ' ' + COLUMN_TYPE[COUNTER] + ' NOT NULL,'
else:
CREATE_TABLE_QUERY = CREATE_TABLE_QUERY + COLUMN_NAMES[COUNTER] + ' ' + COLUMN_TYPE[COUNTER] + ' ,'
COUNTER += 1
CREATE_TABLE_QUERY = CREATE_TABLE_QUERY[:-2]+ ')'
return CREATE_TABLE_QUERY
def COPY_COMMAND():
S3_PATH = 's3://' + S3_DOWNLOAD_PATH
COPY_COMMAND = "COPY "+REDSHIFT_SCHEMA+"."+TABLE_NAME+" from '"+S3_PATH+"' credentials 'aws_access_key_id="+AWS_KEY+";aws_secret_access_key="+AWS_SECRET_KEY+"' REGION 'us-west-2' csv delimiter ',' ignoreheader as 1 TRIMBLANKS maxerror as 500"
return COPY_COMMAND
def S3TOREDSHIFT():
conn = psycopg2.connect("dbname='xxx' port='5439' user='xxx' host='xxxxxx' password='xxxxx'")
cursor = conn.cursor()
cursor.execute('DROP TABLE IF EXISTS '+ REDSHIFT_SCHEMA + "." + TABLE_NAME)
SCHEMA = TRAVERSEDATA()
print(SCHEMA)
cursor.execute(SCHEMA)
COPY = COPY_COMMAND()
print(COPY)
cursor.execute(COPY)
conn.commit()
S3TOREDSHIFT()
Current Challenges:
Challenges with creating the table structure :
Field lengths : Right now I am just hardcoding the VARCHAR fields to 2500. All my files are > 30gb and parsing through the whole file to calculate length of a field takes lot of processing time.
Determining if a column is null: I am simply hard coding the first column to NOT NULL using the COUNTER variable. ( All my files have ID as first column ). Would like to know if there is a better way of doing it.
Is there any data structure I can use? I am always interested in learning new ways to improve the performance, if you guys have any suggestions please feel free to comment.
Related
Im using Regex to match the following excel file and Im struggling with how I can
seperate each row by
Timestamp [0:00:48],
ID 20052A
and the content content (more content)
This is the excel row (one of many, so the ID can vary from row to row and the timestamp as well as the content too)
[0:00:48] 20052A: content (more content)
I get an Error code
AttributeError: 'NoneType' object has no attribute 'group
for matching my ID where I have
(r"^(.+:)(.+)|(r(\w+)?\s*\[(.*)\]\s*(\w+))", c)
Keep in mind that from time to time the ID looks something like this
[0:00:33] 30091aA: (content)
My whole skript is (cancel out the connection to database)
import os
import re
import pymysql
pymysql.install_as_MySQLdb()
import pandas as pd
import sqlalchemy
def insert_or_update(engine, pd_table, table_name):
inserts = 0
updates = 0
for i in range(len(pd_table)):
vals_with_quotes = ["'" + str(x) + "'" for x in pd_table.loc[i, :].values]
# print(vals_with_quotes)
update_pairs = [str(c) + " = '" + str(v) + "'" for c, v in zip(pd_table.columns, pd_table.loc[i, :])]
query = f"INSERT INTO {table_name} ({', '.join(list(pd_table.columns.values))}) " \
f"VALUES ({', '.join(vals_with_quotes)}) " \
f"ON DUPLICATE KEY UPDATE {', '.join(update_pairs)}"
print(query)
result = engine.execute(query)
if result.lastrowid == 0:
updates += 1
else:
inserts += 1
print(f"Inserted {inserts} rows and updated {updates} rows.")
schema = '---'
alchemy_connect = "---"
engine = sqlalchemy.create_engine(alchemy_connect) # connect to server
engine.execute(f"USE {schema}") # select new db
# engine.execute("SET NAMES UTF8MB4;")
query = "SELECT * FROM .... where ...=..."
pm = pd.read_sql(query, engine)
rootpath = "path/"
for root, dirs, files in os.walk(rootpath):
for file in files:
print(root, dirs, files, file)
d = pd.read_excel(root + file, header=None)
d.drop(columns=[0], inplace=True)
d.rename(columns={1: "content"}, inplace=True)
participants = []
for ix, row in d.iterrows():
c = row["content"]
match = re.search(r"^(.+:)(.+)|(r(\w+)?\s*\[(.*)\]\s*(\w+))", c)
prefix = match.group(1)
only_content = match.group(2)
try:
timestamp = re.search(r"\[(\d{1,2}:\d{1,2}:\d{1,2})\]", prefix).group(1)
except:
timestamp = "-99"
# print(timestamp)
if re.search(r"\s(Versuchsleiter|ersuchsleiter|Versuchsleit|Versuch):", prefix):
id_code = "Versuchsleiter"
else:
starting_digits = re.search(r"^(\d+)", prefix)
id_code = re.search(r"(\d{2,4}.{1,3}):", prefix).group(1)
if hasattr(starting_digits, 'group'):
id_code = starting_digits.group(1) + id_code #
# get pid
participant = pm.loc[pm["id_code"] == id_code, "pid"]
try:
pid = participant.values[0]
except:
pid = "Versuchsleiter"
# print(ix, pid, id_code, only_content, timestamp)
if pid and pid not in participants and pid != "Versuchsleiter":
participants.append(pid)
d.loc[ix, "pid"] = pid
d.loc[ix, "timestamp"] = timestamp
d.loc[ix, "content"] = only_content.strip()
d.loc[ix, "is_participant"] = 0 if pid == "Versuchsleiter" else 1
d = d[["pid", "is_participant", "content", "timestamp"]]
d.loc[(d['pid'] == "Versuchsleiter"), "pid"] = participants[0]
d.loc[(d['pid'] == None), "pid"] = participants[0]
insert_or_update(engine, d, "table of sql")```
I need "Versuchsleiter" since some of the ID's are "Versuchsleiter"
Thank you!
You should take advantage from using capturing groups.
All the initial regex matching (after c = row["content"] and before # get pid) can be done with
match = re.search(r"^\[(\d{1,2}:\d{1,2}:\d{1,2})]\s+(\w+):\s*(.*)", c)
if match:
timestamp = match.group(1)
id_code = match.group(2)
only_content = match.group(3)
if re.search(r"(?:Versuch(?:sleit(?:er)?)?|ersuchsleiter)", id_code):
id_code = "Versuchsleiter"
Your timestamp will be 0:00:33, only_content will hold (content) and id_code will contain 30091aA.
See the regex demo
Thank you for your help but this gives me the following error
Traceback (most recent call last):
File "C:/Users/.../PycharmProjects/.../.../....py", line 80, in <module>
insert_or_update(engine, d, "sql table")
TypeError: not enough arguments for format string
This is a piece of code which needs to perform the follow functionality:
Dump all table names in a database
From each table search for a column with either Latitude or Longitude in
Store these co-ords as a json file
The code was tested and working on a single database. However once it was put into another piece of code which calls it with different databases it now is not entering line 49. However there is no error either so I am struggling to see what the issue is as I have not changed anything.
Code snippet line 48 is the bottom line -
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print (cursor)
for tablerow in cursor.fetchall():
I am running this in the /tmp/ dir due to an earlier error with sqlite not working outside the temp.
Any questions please ask them.
Thanks!!
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sqlite3
import os
import sys
filename = sys.argv[1]
def validateFile(filename):
filename, fileExt = os.path.splitext(filename)
print ("[Jconsole] Python: Filename being tested - " + filename)
if fileExt == '.db':
databases(filename)
elif fileExt == '.json':
jsons(fileExt)
elif fileExt == '':
blank()
else:
print ('Unsupported format')
print (fileExt)
def validate(number):
try:
number = float(number)
if -90 <= number <= 180:
return True
else:
return False
except ValueError:
pass
def databases(filename):
dbName = sys.argv[2]
print (dbName)
idCounter = 0
mainList = []
lat = 0
lon = 0
with sqlite3.connect(filename) as conn:
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print (cursor)
for tablerow in cursor.fetchall():
print ("YAY1")
table = tablerow[0]
cursor.execute('SELECT * FROM {t}'.format(t=table))
for row in cursor:
print(row)
print ("YAY")
tempList = []
for field in row.keys():
tempList.append(str(field))
tempList.append(str(row[field]))
for i in tempList:
if i in ('latitude', 'Latitude'):
index = tempList.index(i)
if validate(tempList[index + 1]):
idCounter += 1
tempList.append(idCounter)
(current_item, next_item) = \
(tempList[index], tempList[index + 1])
lat = next_item
if i in ('longitude', 'Longitude'):
index = tempList.index(i)
if validate(tempList[index + 1]):
(current_item, next_item) = \
(tempList[index], tempList[index + 1])
lon = next_item
result = '{ "id": ' + str(idCounter) \
+ ', "content": "' + dbName + '", "title": "' \
+ str(lat) + '", "className": "' + str(lon) \
+ '", "type": "box"},'
mainList.append(result)
file = open('appData.json', 'a')
for item in mainList:
file.write('%s\n' % item)
file.close()
# {
# ...."id": 1,
# ...."content": "<a class='thumbnail' href='./img/thumbs/thumb_IMG_20161102_151122.jpg'>IMG_20161102_151122.jpg</><span><img src='./img/thumbs/thumb_IMG_20161102_151122.jpg' border='0' /></span></a>",
# ...."title": "50.7700721944444",
# ...."className": "-0.8727045",
# ...."start": "2016-11-02 15:11:22",
# ...."type": "box"
# },
def jsons(filename):
print ('JSON')
def blank():
print ('blank')
validateFile(filename)
Fixed.
The issue was up here
filename, fileExt = os.path.splitext(filename)
The filename variable was being overwritten without the file extension so when SQLite searched it didn't find the file.
Strange no error appeared but it is fixed now by changing the filename var to filename1.
I'm programming a script that connects to an Oracle database and get the results into a log file. I want to get a output like this:
FEC_INCLUSION = 2005-08-31 11:43:48,DEBITO_PENDIENTE = None,CAN_CUOTAS = 1.75e-05,COD_CUENTA = 67084,INT_TOTAL = None,CAN_CUOTAS_ANTERIOR = None,COD_INVERSION = 1,FEC_MODIFICACION = 10/04/2012 09:45:22,SAL_TOT_ANTERIOR = None,CUOTA_COMISION = None,FEC_ULT_CALCULO = None,MODIFICADO_POR = CTAPELA,SAL_TOTAL = 0.15,COD_TIPSALDO = 1,MONTO_COMISION = None,COD_EMPRESA = 1,SAL_INFORMATIVO = None,COD_OBJETIVO = 5,SAL_RESERVA = None,INCLUIDO_POR = PVOROPE,APORTE_PROM = 0.0,COSTO_PROM = None,CREDITO_PENDIENTE = None,SAL_PROM = 0.0,
FEC_INCLUSION = 2005-08-31 11:43:49,DEBITO_PENDIENTE = None,CAN_CUOTAS = 0.0,COD_CUENTA = 67086,INT_TOTAL = None,CAN_CUOTAS_ANTERIOR = None,COD_INVERSION = 9,FEC_MODIFICACION = 25/02/2011 04:38:52,SAL_TOT_ANTERIOR = None,CUOTA_COMISION = None,FEC_ULT_CALCULO = None,MODIFICADO_POR = OPEJAMO,SAL_TOTAL = 0.0,COD_TIPSALDO = 1,MONTO_COMISION = None,COD_EMPRESA = 1,SAL_INFORMATIVO = None,COD_OBJETIVO = 5,SAL_RESERVA = None,INCLUIDO_POR = PVOROPE,APORTE_PROM = 0.0,COSTO_PROM = None,CREDITO_PENDIENTE = None,SAL_PROM = 0.0,
I created a dictionary with the query results:
def DictFactory(description,data):
column_names = [col[0] for col in description]
results = []
for row in data:
results.append(dict(zip(column_names,row)))
return results
Then I created this function which finally save the results into my log:
def WriteLog(log_file,header,data):
file_exist = os.path.isfile(log_file)
log = open(log_file,'a')
if not file_exist:
print "File does not exist, writing new log file"
open(log_file,'w').close()
mydata = DictFactory(header,data)
checkpoint_name = ReadCheckpointName()
string = ''
for m in mydata:
for k,v in m.items():
string = string + k + ' = ' + str(v) + ','
if k == checkpoint_name:
#print "KEY FOUND"
cur_checkpoint = v
cur_checkpoint = str(cur_checkpoint)
#print string
string = string + '\n'
print cur_checkpoint
log.write(string + '\n')
WriteCheckpoint(cur_checkpoint,checkpoint_file)
log.close()
This is the main function:
def GetInfo():
mypool = PoolToDB()
con = mypool.acquire()
cursor = con.cursor()
GetLastCheckpoint()
sql = ReadQuery()
#print sql
cursor.execute(sql)
data = cursor.fetchall()
WriteLog(log_file,cursor.description,data)
#WriteCsvLog(log_file,cursor.description,data)
cursor.close()
But I realized that it works if I use a query that fetch few records, however if I try to fetch many records my script never ends.
This is my output when I executed a query with 5000 records. As you can see it takes too long.
jballesteros#SplunkPorvenir FO_TIPSALDOS_X_CUENTA]$ python db_execution.py
Starting connection: 5636
GetLastCheckpoint function took 0.073 ms
GetLastCheckpoint function took 0.025 ms
ReadQuery function took 0.084 ms
File does not exist, writing new log file
DictFactory function took 23.050 ms
ReadCheckpointName function took 0.079 ms
WriteCheckpoint function took 0.204 ms
WriteLog function took 45112.133 ms
GetInfo function took 46193.033 ms
I'm pretty sure you know a much better way to do what I am trying to do.
This is the complete code:
#!/usr/bin/env python
# encoding: utf-8
import re
import sys
try:
import cx_Oracle
except:
print "Error: Oracle module required to run this plugin."
sys.exit(0)
import datetime
import re
import commands
import os
from optparse import OptionParser
import csv
import time
#################################
#### Database Variables ####
#################################
Config = {
"host" : "",
"user" : "",
"password" : "",
"instance" : "",
"port" : "",
}
Query = {
"sql" : "",
"checkpoint_datetype" : "",
"checkpoint_name" : "",
}
dir = '/home/jballesteros/PENS2000/FO_TIPSALDOS_X_CUENTA/'
connection_dir = '/home/jballesteros/PENS2000/Connection'
checkpoint_file = dir + 'checkpoint.conf'
log_file = '/var/log/Pens2000/FO_TIPSALDOS_X_CUENTA.csv'
internal_log = '/var/log/Pens2000/internal.log'
query = dir + 'query'
sys.path.append(os.path.abspath(connection_dir))
from db_connect_pool import *
def Timing(f):
def wrap(*args):
time1 = time.time()
ret = f(*args)
time2 = time.time()
print "%s function took %0.3f ms" % (f.func_name,(time2- time1)*1000.0)
return ret
return wrap
#Timing
def InternalLogWriter(message):
now = datetime.datetime.now()
log = open(internal_log, 'a')
log.write("%s ==> %s" % (now.strftime("%Y-%m-%d %H:%M:%S"),message))
log.close()
return
#Timing
def GetLastCheckpoint():
global cur_checkpoint
conf = open(checkpoint_file, 'r')
cur_checkpoint = conf.readline()
cur_checkpoint = cur_checkpoint.rstrip('\n')
cur_checkpoint = cur_checkpoint.rstrip('\r')
conf.close()
#Timing
def ReadQuery():
global cur_checkpoint
GetLastCheckpoint()
qr = open(query, 'r')
line = qr.readline()
line = line.rstrip('\n')
line = line.rstrip('\r')
Query["sql"], Query["checkpoint_datetype"],Query["checkpoint_name"] = line.split(";")
sql = Query["sql"]
checkpoint_datetype = Query["checkpoint_datetype"]
checkpoint_name = Query["checkpoint_name"]
if (checkpoint_datetype == "DATETIME"):
sql = sql + " AND " + checkpoint_name + " >= " + "TO_DATE('%s','YYYY-MM-DD HH24:MI:SS') ORDER BY %s" % (cur_checkpoint,checkpoint_name)
if (checkpoint_datetype == "NUMBER"):
sql = sql + " AND " + checkpoint_name + " > " + "%s ORDER BY %s" % (cur_checkpoint,checkpoint_name)
qr.close()
return str(sql)
#Timing
def ReadCheckpointName():
qr = open(query, 'r')
line = qr.readline()
line = line.rstrip('\n')
line = line.rstrip('\r')
Query["sql"], Query["checkpoint_datetype"],Query["checkpoint_name"] = line.split(";")
checkpoint_name = Query["checkpoint_name"]
return str(checkpoint_name)
#Timing
def LocateCheckPoint(description):
description
checkpoint_name = ReadCheckpointName()
#print checkpoint_name
#print description
startcounter = 0
finalcounter = 0
flag = 0
for d in description:
prog = re.compile(checkpoint_name)
result = prog.match(d[0])
startcounter = startcounter + 1
if result:
finalcounter = startcounter - 1
counterstr = str(finalcounter)
print "Checkpoint found in the array position number: " + counterstr
flag = 1
if (flag == 0):
print "Checkpoint did not found"
return finalcounter
#Timing
def DictFactory(description,data):
column_names = [col[0] for col in description]
results = []
for row in data:
results.append(dict(zip(column_names,row)))
return results
#Timing
def WriteCsvLog(log_file,header,data):
checkpoint_index = LocateCheckPoint(header)
file_exists = os.path.isfile(log_file)
with open(log_file,'ab') as csv_file:
headers = [i[0] for i in header]
csv_writer = csv.writer(csv_file,delimiter='|')
if not file_exists:
print "File does not exist, writing new CSV file"
csv_writer.writerow(headers) # Writing headers once
for d in data:
csv_writer.writerow(d)
cur_checkpoint = d[checkpoint_index]
cur_checkpoint = str(cur_checkpoint)
WriteCheckpoint(cur_checkpoint,checkpoint_file)
csv_file.close()
#Timing
def WriteLog(log_file,header,data):
file_exist = os.path.isfile(log_file)
log = open(log_file,'a')
if not file_exist:
print "File does not exist, writing new log file"
open(log_file,'w').close()
mydata = DictFactory(header,data)
checkpoint_name = ReadCheckpointName()
#prin #string = ''
for m in mydata:
for k,v in m.items():
string = string + k + ' = ' + str(v) + ','
if k == checkpoint_name:
#print "KEY FOUND"
cur_checkpoint = v
cur_checkpoint = str(cur_checkpoint)
#print string
string = string + '\n'
print cur_checkpoint
log.write(string + '\n')
WriteCheckpoint(cur_checkpoint,checkpoint_file)
log.close()
#Timing
def WriteCheckpoint(cur_checkpoint,conf_file):
conf = open(conf_file,'w')
conf.write(cur_checkpoint)
conf.close()
#Timing
def GetInfo():
mypool = PoolToDB()
con = mypool.acquire()
cursor = con.cursor()
GetLastCheckpoint()
sql = ReadQuery()
#print sql
cursor.execute(sql)
#data = cursor.fetchall()
#WriteLog(log_file,cursor.description,data)
#WriteCsvLog(log_file,cursor.description,data)
cursor.close()
def __main__():
parser = OptionParser()
parser.add_option("-c","--change- password",dest="pass_to_change",help="Change the password for database connection",metavar="1")
(options, args) = parser.parse_args()
if (options.pass_to_change):
UpdatePassword()
else:
GetInfo()
__main__()
This is a query sample:
SELECT COD_EMPRESA, COD_TIPSALDO, COD_INVERSION, COD_CUENTA, COD_OBJETIVO, CAN_CUOTAS, SAL_TOTAL, INT_TOTAL, SAL_RESERVA, APORTE_PROM, SAL_PROM, COSTO_PROM, SAL_TOT_ANTERIOR, FEC_ULT_CALCULO, INCLUIDO_POR, FEC_INCLUSION, MODIFICADO_POR, TO_CHAR(FEC_MODIFICACION,'DD/MM/YYYY HH24:MI:SS') AS FEC_MODIFICACION, CUOTA_COMISION, MONTO_COMISION, SAL_INFORMATIVO, CREDITO_PENDIENTE, DEBITO_PENDIENTE, CAN_CUOTAS_ANTERIOR FROM FO.FO_TIPSALDOS_X_CUENTA WHERE ROWNUM <=100000 AND FEC_INCLUSION >= TO_DATE('2005-08-31 11:43:49','YYYY-MM-DD HH24:MI:SS') ORDER BY FEC_INCLUSION
PS: I've really been searching in google and this forum about my question but I haven't found anything similar.
From http://fuzzytolerance.info/blog/2012/01/13/2012-01-14-updating-google-fusion-table-from-a-csv-file-using-python/ I have edited his code to import the necessary modules, however I get the following error "AttributeError: 'module' object has no attribute 'urlencode'". I run the code and I am prompted to enter my password, I enter my own google account password, and then the code gives me the error message, pehaps I need to define a password somewhere?
I wonder if anyone can please trouble shoot my code or advise me on how to avoid this error or even advise me of an EASIER way to import a CSV into a GOOGLE FUSION TABLE that I OWN
Here is my code
import csv
from decimal import *
import getpass
from fusiontables.authorization.clientlogin import ClientLogin
from fusiontables import ftclient
nameAgeNick = 'C:\\Users\\User\\Desktop\\NameAgeNickname.txt'
# check to see if something is an integer
def isInt(s):
try:
int(s)
return True
except ValueError:
return False
# check to see if something is a float
def isFloat(s):
try:
float(s)
return True
except ValueError:
return False
# open the CSV file
ifile = open(nameAgeNick, "rb")
reader = csv.reader(ifile)
# GFT table ID
tableID = "tableid"
# your username
username = "username"
# prompt for your password - you can hardcode it but this is more secure
password = getpass.getpass("Enter your password:")
# Get token and connect to GFT
token = ClientLogin().authorize(username, password)
ft_client = ftclient.ClientLoginFTClient(token)
# Loop through the CSV data and upload
# Assumptions for my data: if it's a float less than 0, it's a percentage
# Floats are being rounded to 1 significant digit
# Non-numbers are wrapped in a single quote for string-type in the updatate statement
# The first row is the column names and matches exactly the column names in Fustion tables
# The first column is the unique ID I'll use to select the record for updating in Fusion Tables
rownum = 0
setList = list()
nid = 0
for row in reader:
# Save header row.
if rownum == 0:
header = row
else:
colnum = 0
setList[:] = []
for col in row:
thedata = col
# This bit rounds numbers and turns numbers < 1 into percentages
if isFloat(thedata):
if isInt(thedata) is False:
if float(thedata) < 1:
thedata = float(thedata) * 100
thedata = round(float(thedata), 1)
else:
thedata = "'" + thedata + "'"
# make sql where clause for row
setList.append(header[colnum] + "=" + str(thedata))
nid = row[0]
colnum += 1
# get rowid and update the record
rowid = ft_client.query("select ROWID from " + tableID + " where ID = " + nid).split("\n")[1]
print( rowid)
print( ft_client.query("update " + tableID + " set " + ",".join(map(str, setList)) + " where rowid = '" + rowid + "'"))
rownum += 1
ifile.close()
And this is the module where the error occurs:
#!/usr/bin/python
#
# Copyright (C) 2010 Google Inc.
""" ClientLogin.
"""
__author__ = 'kbrisbin#google.com (Kathryn Brisbin)'
import urllib, urllib2
class ClientLogin():
def authorize(self, username, password):
auth_uri = 'https://www.google.com/accounts/ClientLogin'
authreq_data = urllib.urlencode({ //////HERE IS ERROR
'Email': username,
'Passwd': password,
'service': 'fusiontables',
'accountType': 'HOSTED_OR_GOOGLE'})
auth_req = urllib2.Request(auth_uri, data=authreq_data)
auth_resp = urllib2.urlopen(auth_req)
auth_resp_body = auth_resp.read()
auth_resp_dict = dict(
x.split('=') for x in auth_resp_body.split('\n') if x)
return auth_resp_dict['Auth']
My python script checks mysqldump and if any problems script prints :
Dump is old for db;
Dump is not complete for db;
Dump is empty for db;
MySQL dump does not exist for db;
Script logs these records to the file line by line.
My question is there are a way to format output in the file like:
Dump is old for db;
Dump is old for db;
Dump is old for db;
Dump is not complete for db;
Dump is not complete for db;
Dump is not complete for db;
Dump is empty for db;
Dump is empty for db;
Dump is empty for db;
Because now my file looks like:
Dump is old for db;
Dump is empty for db;
Dump is old for db;
MySQL dump does not exist for db;
...
etc
Here my small script :)
#!/bin/env python
import psycopg2
import sys,os
from subprocess import Popen, PIPE
from datetime import datetime
import smtplib
con = None
today = datetime.now().strftime("%Y-%m-%d")
log_dump_fail = '/tmp/mysqldump_FAIL'
log_fail = open(log_dump_fail,'w').close()
log_fail = open(log_dump_fail, 'a')
sender = 'PUT_SENDER_NAME_HERE'
receiver = ['receiver_name']
smtp_daemon_host = 'localhost'
def db_backup_file_does_not_exist(db_backup_file):
if not os.path.exists(db_backup_file): return True
else: return False
def dump_health(last_dump_row, file_name,db):
last_row = last_dump_row.rsplit(" ")
tms = ''.join(last_row[4:5])
status = last_row[1:3]
if (status) and (tms != today):
log_fail.write("\nDB is old for "+ str(db) + str(file_name) + ", \nDump finished at " + str(''.join(tms)))
log_fail.write("\n-------------------------------------------")
elif not (status) and (tms == None):
log_fail.write("\nDump is not complete for "+str(db) + str(file_name) + " , end of file is not correct")
log_fail.write("\n-------------------------------------------")
suffixes = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
def humansize(nbytes):
if nbytes == 0: return '0 B'
i = 0
while nbytes >= 1024 and i < len(suffixes)-1:
nbytes /= 1024.
i += 1
f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
return '%s %s' % (f, suffixes[i])
def dump_size(dump_file, file_name,db):
size = os.path.getsize(dump_file)
if (size < 1024):
human_readable = humansize(size)
log_fail.write("\nDump is empty for " +str(db) + "\n" +"\t" + str (file_name)+", file size is " + str(human_readable))
log_fail.write("\n-------------------------------------------")
def report_to_noc(isubject,text):
TEXT = text
SUBJECT = subject
message = 'Subject: %s\n\n%s' % (SUBJECT, TEXT)
server = smtplib.SMTP(smtp_daemon_host)
server.sendmail(sender, receiver, message)
server.quit()
try:
con = psycopg2.connect(database='**', user='***', password='***', host='****')
cur = con.cursor()
cur.execute("""\
select ad.servicename, (select name from servers where id = ps.server_id) as servername
from packages as p, account_data as ad, package_servers as ps
where p.id=ad.package_id and
p.date_deleted IS NULL and
p.id=ps.package_id and
p.aktuel IS NULL and
p.pre_def_package_id = 4 and
p.mother_package_id !=0 and
ps.subservice_id=5 and
p.mother_package_id NOT IN (select id from packages where date_deleted IS NOT NULL)
ORDER BY servername;
""")
while (1):
row = cur.fetchone ()
if row == None:
break
db = row[0]
server_name = str(row[1])
if (''.join(server_name) == 'SKIP_THIS') or (''.join(server_name) == 'SKIP_THIS'):
continue
else:
db_backup_file = '/storage/backup/db/mysql/' + str(db) + '/current/' + str(db) + '.mysql.gz'
db_backup_file2 = '/storage/backup/' + str(''.join(server_name.split("DB"))) + '/mysql/' + str(db) + '/current/'+ str(db) + '.mysql.gz'
db_file_does_not_exist = False
db_file2_does_not_exist = False
if db_backup_file_does_not_exist(db_backup_file):
db_file_does_not_exist = True
if db_backup_file_does_not_exist(db_backup_file2):
db_file2_does_not_exist = True
if db_file_does_not_exist and db_file2_does_not_exist:
log_fail.write("\nMySQL dump does not exist for " + str(db) + "\n" + "\t" + str(db_backup_file2) + "\n" + "\t" + str(db_backup_file))
log_fail.write("\n-------------------------------------------")
continue
elif (db_file_does_not_exist) and not (db_file2_does_not_exist):
p_zcat = Popen(["zcat", db_backup_file2], stdout=PIPE)
p_tail = Popen(["tail", "-2"], stdin=p_zcat.stdout, stdout=PIPE)
dump_status = str(p_tail.communicate()[0])
dump_health(dump_status,db_backup_file2,db)
dump_size(db_backup_file2, db_backup_file2,db)
elif (db_file2_does_not_exist) and not (db_file_does_not_exist):
p_zcat = Popen(["zcat", db_backup_file], stdout=PIPE)
p_tail = Popen(["tail", "-2"], stdin=p_zcat.stdout, stdout=PIPE)
dump_status = str(p_tail.communicate()[0])
dump_health(dump_status,db_backup_file,db)
dump_size(db_backup_file,db_backup_file,db)
con.close()
except psycopg2.DatabaseError, e:
print 'Error %s' % e
sys.exit(1)
log_fail.close()
if os.path.getsize(log_dump_fail) > 0:
subject = "Not all MySQL dumps completed successfully. Log file backup:" + str(log_dump_fail)
fh = open(log_dump_fail, 'r')
text = fh.read()
fh.close()
report_to_noc(subject,text)
else:
subject = "MySQL dump completed successfullyi for all DBs, listed in PC"
text = "Hello! \nI am notifying you that I checked mysqldump files this morning.\nThere are nothing to worry about. :)"
report_to_noc(subject,text)
You can process your log file after it has been written.
One option is to read your file and sort the lines:
lines = open('log.txt').readlines()
lines.sort()
open('log_sorted.txt', 'w').write("\n".join(lines))
This won't emit an empty line between log types.
Another option is to use a Counter:
from collections import Counter
lines = open('log.txt').readlines()
counter = Counter()
for line in lines:
counter[line] += 1
out_file = open('log_sorted.txt', 'w')
for line, num in counter.iteritems():
out_file.write(line * num + "\n")
Looks like you want to group the output of the script, rather than log the info as it comes while searching.
Easiest would be to maintain 4 lists, on each for empty, not empty and so on. In the script add the db names to appropriate list instead of logging, and then dump the lists one by one into the file with appropriate prefixes("not empty for" + dbname).
For example, remove all the log_fail.write() from the functions and replace them with list.append() and write a separate function that writes to the log file as you like:
Add lists:
db_dump_is_old_list = []
db_dump_is_empty_list = []
db_dump_is_not_complete_list = []
db_dump_does_not_exist_list = []
Modify the Functions:
def dump_health(last_dump_row, file_name,db):
last_row = last_dump_row.rsplit(" ")
tms = ''.join(last_row[4:5])
status = last_row[1:3]
if (status) and (tms != today):
db_dump_is_old_list.append(str(db))
#log_fail.write("\nDB is old for "+ str(db) + str(file_name) + ", \nDump finished at " + str(''.join(tms)))
#log_fail.write("\n-------------------------------------------")
elif not (status) and (tms == None):
db_dump_is_not_complete_list.append(str(db)
#log_fail.write("\nDump is not complete for "+str(db) + str(file_name) + " , end of file is not correct")
#log_fail.write("\n-------------------------------------------")
def dump_size(dump_file, file_name,db):
size = os.path.getsize(dump_file)
if (size < 1024):
human_readable = humansize(size)
db_dump_is_empty_list.append(str(db))
#log_fail.write("\nDump is empty for " +str(db) + "\n" +"\t" + str (file_name)+", file size is " + str(human_readable))
#log_fail.write("\n-------------------------------------------")
if db_file_does_not_exist and db_file2_does_not_exist:
db_dump_does_not_exist_list.append(str(db))
#log_fail.write("\nMySQL dump does not exist for " + str(db) + "\n" + "\t" + str(db_backup_file2) + "\n" + "\t" + str(db_backup_file))
#log_fail.write("\n-------------------------------------------")
continue
And add a logger function:
def dump_info_to_log_file():
log_dump_fail = '/tmp/mysqldump_FAIL'
log_fail = open(log_dump_fail,'w').close()
log_fail = open(log_dump_fail, 'a')
for dbname in db_dump_is_old_list:
log_fail.write("Dump is Old for" + str(dbname))
log_fail.write("\n\n")
for dbname in db_dump_is_empty_list:
log_fail.write("Dump is Empty for" + str(dbname))
log_fail.write("\n\n")
for dbname in db_dump_is_not_complete_list:
log_fail.write("Dump is Not Complete for" + str(dbname))
log_fail.write("\n\n")
for dbname in db_dump_does_not_exist_list:
log_fail.write("Dump Does Not Exist for" + str(dbname))
log_fail.close()
Or you could simply log as you are doing, and then read in the file, sort and write back the file.
Thank you all for all interesting ideas.
I have really tried all options :)
To my mind:
With Counter object the pros is to few lines of code.
But cons are - many read\write operations. Log file is not big, however, I decided to decrease read(s) \ write(s)
With array the cons are to many lines of code :) but the pros is - write to the file only once.
So I implemented arrays.. :)
Thank you guys!!!