I am trying to export the whole database schema (around 20 GB) with a PostgreSQL query and build one final HDF5 file.
Because this much data doesn't fit in my computer's memory, I am using the chunksize argument.
First I use this function to establish the connection:
def make_connectstring(prefix, db, uname, passa, hostname, port):
    """return an sql connectstring"""
    connectstring = prefix + "://" + uname + ":" + passa + "@" + hostname + \
                    ":" + port + "/" + db
    return connectstring
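For reference, with placeholder values this produces a standard SQLAlchemy URL (note the separator before the hostname must be "@"):
# Hedged example with placeholder values; the result is a normal SQLAlchemy URL.
cs = make_connectstring("postgresql", "mydb", "myuser", "mypass", "localhost", "5432")
# -> "postgresql://myuser:mypass@localhost:5432/mydb"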
Then I create a temporary folder to save each HDF5 chunk file:
import tempfile

import pandas as pd
import sqlalchemy

def query_to_hdf5(connectstring, query, verbose=False, chunksize=50000):
    engine = sqlalchemy.create_engine(connectstring,
                                      server_side_cursors=True)
    # write the data to temporary chunk files
    i = 0
    paths_chunks = []
    with tempfile.TemporaryDirectory() as td:
        for df in pd.read_sql_query(sql=query, con=engine, chunksize=chunksize):
            path = td + "/chunk" + str(i) + ".hdf5"
            df.to_hdf(path, key='data')
            if verbose:
                print("wrote", path)
            paths_chunks.append(path)
            i += 1
connectstring = make_connectstring(prefix, db, uname, passa, hostname, port)
query = "SELECT * FROM public.zz_ges"
df = query_to_hdf5(connectstring, query)
What is the best way to merge all these files into one single file that represents the whole DataFrame?
I tried something like this:
df = pd.DataFrame()
for path in paths_chunks:
    df_scratch = pd.read_hdf(path)
    df = pd.concat([df, df_scratch])
    if verbose:
        print("read", path)
However, memory usage climbs very fast. I need something more efficient.
Update:
def make_connectstring(prefix, db, uname, passa, hostname, port):
    """return an sql connectstring"""
    connectstring = prefix + "://" + uname + ":" + passa + "@" + hostname + \
                    ":" + port + "/" + db
    return connectstring
def query_to_df(connectstring, query, verbose=False, chunksize=50000):
    engine = sqlalchemy.create_engine(connectstring,
                                      server_side_cursors=True)
    # stream the query results straight into a single HDF5 store
    with pd.HDFStore('output.h5', 'w') as store:
        for df in pd.read_sql_query(sql=query, con=engine, chunksize=chunksize):
            store.append('data', df)
I'd suggest using an HDFStore directly; that way you can append chunks as you get them from the database, something like:
with pd.HDFStore('output.h5', 'w') as store:
    for df in pd.read_sql_query(sql=query, con=engine, chunksize=chunksize):
        store.append('data', df)
This is based on your existing code, so it isn't complete; let me know if anything is unclear.
Note that I'm opening the store in "w" mode, so it will truncate the file every time; otherwise append would just keep adding the same rows to the end of the table. Alternatively, you could remove the key first.
When you open the store you also get options such as compression, though they don't seem to be well documented; help(pd.HDFStore) describes complevel and complib.
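For example, a hedged sketch of both: opening the store with compression (complevel/complib are the documented HDFStore parameters; 'blosc' at level 9 is just one choice), and reading the merged file back in chunks later so memory stays flat:
# Compression: pass complevel/complib when the store is created.
with pd.HDFStore('output.h5', 'w', complevel=9, complib='blosc') as store:
    for df in pd.read_sql_query(sql=query, con=engine, chunksize=chunksize):
        store.append('data', df)

# Reading back in chunks (works because append() stores the data in table format).
with pd.HDFStore('output.h5', 'r') as store:
    for chunk in store.select('data', chunksize=50000):
        process(chunk)  # process() is a placeholder for whatever you do per chunk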
Related
I'm connecting to an Oracle database from a Python script and extracting around 10 tables. One table holds about 3 GB of data, and with the code below it took around 4 hours to extract it and upload it to S3. How can I improve the performance of this script?
Would a different file format, such as Parquet instead of CSV, improve performance?
Any suggestions or solutions would be highly appreciated.
Below is the code I tried:
def extract_handler():
    # Parameters defined in cloudwatch event
    env = os.environ['Environment'] if 'Environment' in os.environ else 'sit'

    # FTP parameters
    host = f"/{env}/connet_HOSTNAME"
    username = f"/{env}/connect_USERNAME"
    password = f"/{env}/connect_PASSWORD"
    host = get_parameters(host)
    username = get_parameters(username)
    password = get_parameters(password)

    today = date.today()
    current_date = today.strftime("%Y%m%d")

    con = None
    cur = None
    tables = ["table1", "table2", "table3", ..., "table10"]
    bucket = "bucket_name"

    for table in tables:
        try:
            con = cx_Oracle.connect(username, password, host, encoding="UTF-8")
            cur = con.cursor()
            logging.info('Successfully established the connection to Oracle db')

            table_name = table.split(".")[1]
            logging.info("######## Table name: " + table + " ######")
            logging.info("****** PROCESSING: " + table_name + " *********")

            cur.execute("SELECT count(*) FROM {}".format(table))
            count = cur.fetchone()[0]
            logging.info("Count: %s", count)

            if count > 0:
                cur1 = con.cursor()
                # Define the desired timestamp format
                timestamp_format = '%Y/%m/%d %H:%M:%S'
                # Execute a query to read a table
                cur1.execute("select * from {} where TRUNC(DWH_CREATED_ON)=TRUNC(SYSDATE)-1".format(table))
                batch_size = 10000
                rows = cur1.fetchmany(batch_size)
                csv_file = f"/tmp/{table_name}.csv"
                with open(csv_file, "w", newline="") as f:
                    # Add file_date column as the first column
                    writer = csv.DictWriter(f, fieldnames=['file_date'] + [col[0] for col in cur1.description],
                                            delimiter='\t')
                    writer.writeheader()
                logging.info("Header added to the table: " + table + " ######")

                while rows:
                    for row in rows:
                        row_dict = {'file_date': current_date}
                        for i, col in enumerate(cur1.description):
                            if col[1] == cx_Oracle.DATETIME:
                                if row[i] is not None:
                                    row_dict[col[0]] = row[i].strftime(timestamp_format)
                                else:
                                    row_dict[col[0]] = ""
                            else:
                                row_dict[col[0]] = row[i]
                        with open(csv_file, "a", newline="") as f:
                            # Add file_date column as the first column
                            writer = csv.DictWriter(f, fieldnames=['file_date'] + [col[0] for col in cur1.description],
                                                    delimiter='\t')
                            writer.writerow(row_dict)
                    # Fetch the next batch of rows
                    rows = cur1.fetchmany(batch_size)

                logging.info("Records written to the temp file for the table: " + table + " ######")
                s3_path = "NorthernRegion" + '/' + table_name + '/' + current_date + '/' + table_name + '.csv'
                s3_client = boto3.client('s3', region_name='region-central-1')
                s3_client.upload_file('/tmp/' + table_name + '.csv', bucket, s3_path)
                logging.info(table + " File uploaded to S3 ######")
            else:
                logging.info('Table not having data')
                return 'Data is not refreshed yet, hence quitting..'

            if cur1:
                cur1.close()
        except Exception as err:
            # Handle or log other exceptions such as bucket doesn't exist
            logging.error(err)
        finally:
            if cur:
                cur.close()
            if con:
                con.close()

    return "Successfully processed"
When I run the code below, the output shows the index. I don't want the index, because I want to pass these values into a SQL query in another DataFrame.
file_loc = path + file
excel = pd.read_excel(file_loc,sheet_name='Sheet1',index_col=None,usecols="A",header=1,names=['id'])
id = pd.DataFrame(excel).reset_index(drop=True)
id['id'] = id['id'].apply(lambda x: "'" + str(x) + "',")
print(id)
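For what it's worth, the index only shows up because the whole DataFrame is printed. A hedged sketch of one way to build the quoted, comma-separated list straight from the column values, so no index is involved (some_table is a placeholder):
# Hedged sketch: build the IN-list from the column values directly.
ids = excel['id'].astype(str).tolist()
id_list = ", ".join("'" + value + "'" for value in ids)           # e.g. "'1', '2', '3'"
query = "SELECT * FROM some_table WHERE id IN (" + id_list + ")"  # some_table is a placeholder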
I'm trying to run parallel threads in a Spark job. This works without a hitch when I run the Python script from the CLI, but my understanding is that it is not really capitalizing on the EMR cluster's parallel processing. It does not actually save the data when I run it as a Spark job, and I'm not even sure it creates the Spark DataFrame when run that way.
I also tried using map instead of the parallel threads, but I couldn't get that to work either.
If I can't get the parallelism to work as a Spark job, it seems like I might as well just run it on a single EC2 instance with parallel threads.
So the basic logic is this:
- Create the Spark context at the top of the script
- Inside a class, pull the list of files to process from an SQS queue
- Loop over the list of files with the following method
# this is run for 10 blocks of 10 files each across the EMR cluster in parallel
def parquet_driver(self):
    max_threads = 20
    futures = []
    pool = ThreadPoolExecutor(max_threads)
    i = 0
    total_files_processed = 0
    while total_files_processed <= len(self.master_file_list):
        while i < max_threads:
            print('Processing %s' % self.master_file_list[i])
            futures.append(pool.submit(self.convert_to_parquet,
                                       self.master_file_list[i]))
            i += 1
        for x in as_completed(futures):
            pass
        # add in i number of files to the total
        total_files_processed += i
Notice this is passing a file to a method called "convert_to_parquet".
def convert_to_parquet(self, file):
    log_file_name = file.split(':')[2].replace('.dat', '.log')
    logger = Logger(log_file_name).get()

    try:
        bucket = s3.Bucket(file.split(':')[0])
        file_name = file.split(':')[2]
        file_obj = bucket.Object(file.split(':')[1] + '/' + file.split(':')[2])
        partition_key = file.split(':')[2].split('.')[2]
        target_table = file.split(':')[2].split('.')[1]
        receipt_handle = file.split(':')[3]
        file_contents = file_obj.get()["Body"].read()

        if 'al1' not in file.split(':')[2]:
            logger.debug('Record type = %s, deleting from queue and returning ..' % target_table)
        else:
            logger.debug('Working on %s..' % target_table)
            app_name = file
            # sc = SparkContext(appName=app_name)
            print('Reading the following file from s3: %s' % file_name)
            print('Found the following file contents on s3: %s' % file_contents)

            rdd = sc.parallelize(file_contents.split('\n')).map(lambda line: line.split(','))
            # rdd = sc.textFile(csv_file).map(lambda line: line.split(','))
            # pd.read_csv(csv_file)
            sqlContext = sql.SQLContext(sc)
            if hasattr(rdd, "toDF"):
                df = rdd.toDF()
            else:
                spark = SparkSession
                df = rdd.toDF()

            logger.debug("Partitioning data to: {0}".format(partition_key))
            # Go to redshift and get the data definition
            metadata = self.build_df_definition('al1')
            if 'cycle_date' in metadata['columns']:
                metadata['columns'].remove('cycle_date')
            if 'log_timestamp' in metadata['columns']:
                metadata['columns'].remove('log_timestamp')
            cols = metadata['columns']
            data_types = metadata['data_types']

            for idx in range(0, len(cols)):
                col_str = '_' + str(int(idx) + 1)
                df_field_value = regexp_replace(df[col_str], '"', '')
                df = df.withColumn(cols[idx], df_field_value.cast(data_types[idx]))

            df = df.withColumn("cycle_date", lit(partition_key))
            # this field will be pushed to the sqs queue
            df = df.withColumn("log_timestamp", lit(self.log_timestamp))

            full_cols = cols
            full_cols.append('cycle_date')
            full_cols.append('log_timestamp')
            print(full_cols)

            ref_df = df.select(full_cols)
            ref_df.show()

            partitionby = ['year', 'month', 'day']
            output = '/opt/data/' + '/' + target_table
            s3_loc = 's3://<bucket>/<prefix>/' + target_table
            codec = 'snappy'
            ref_df.write.partitionBy(['cycle_date']).format("parquet").save(s3_loc, mode="append")
            # sc.stop()
    except Exception as e:
        logger.debug(e)
        traceback.print_exc()
        open("/opt/logs/dump.log", "w").write(traceback.format_exc())
        exit()
    else:
        # Delete received message from queue
        sqs.delete_message(
            QueueUrl=self.queue_url,
            ReceiptHandle=receipt_handle
        )
        logger.debug('Received and deleted file: %s' % file)
Thanks in advance for advice on this problem...
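Not an answer, but one pattern worth considering: rather than driver-side threads, let Spark itself parallelize across the cluster by handing it all the file paths at once. A hedged sketch, assuming the files are CSV-like and the S3 paths can be built from the SQS messages (paths and names below are placeholders):
# Hedged sketch: read every file in one go and let Spark distribute the work.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("parquet_conversion").getOrCreate()

# s3_paths would be built from the SQS messages; these are placeholders
s3_paths = ["s3://<bucket>/<prefix>/file1.dat", "s3://<bucket>/<prefix>/file2.dat"]

df = spark.read.csv(s3_paths)  # one distributed DataFrame over all the files
df.write.mode("append").parquet("s3://<bucket>/<prefix>/output")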
I am trying to create a Python script to import a set of CSVs into a MySQL database.
Each CSV filename matches the destination table. The first row of each CSV matches the fields of the table. Each CSV / table has a different number of fields, field names, etc.
The problem I am having is with this line (full code below):
ins = table_name.insert().values(temp_variable_name)
where I want to dynamically set the destination table (table_name) and the insert values (temp_variable_name).
So when reading the labels.csv file, this should produce
ins = labels.insert().values(id_label=d[0], label_name=d[1])
and when reading the company.csv file, this should produce
ins = company.insert().values(id_company=d[0], company_name=d[1], ticker=d[2])
The problem is that if I generate a string,
temp_variable_name = 'id_company=d[0], company_name=d[1], ticker=d[2]'
I end up getting a 'str' object has no attribute 'items' error.
Is there any way to dynamically generate an insert command for an SQL statement?
Portion of the script below:
# files list contains a list of all of the files in the directory
# we read in CSVs, isolate the first row to determine table field names
# the rest of the data should then be imported into the table with the corresponding name as the CSV
for f in files:
    if '.csv' in f:
        # read in each CSV file
        # these are a Class / Function I've set up to read files
        x = Read_Files()
        data = x.read_file_lines_strip(path, f)

        temp = data[0].replace('"', '')       # get rid of quotation marks from the data
        table_header_list = temp.split('|')   # get the first row, which is the table field names

        variable_name = ''                    # this is used to construct the insert into table string
        for x in xrange(0, len(table_header_list)):
            if x == 0:
                variable_name = variable_name + table_header_list[0] + '=d[0]'
            elif x == len(table_header_list):
                variable_name = variable_name + table_header_list[x] + '=d[' + str(x) + ']'
            else:
                variable_name = variable_name + ', ' + table_header_list[x] + '=d[' + str(x) + ']'

        table_name = f.replace('.csv', '')    # remove the .csv from filename to isolate the file name, which is the same as table name

        # data from file
        for data_line in data[1:]:
            data_line = data_line.replace('"', '')   # remove quotation marks
            d = data_line.split('|')                 # split the line which is delimited by a |

            # used to construct the final insert string
            for x in xrange(0, len(table_header_list)):
                if x == 0:
                    temp_variable_name = variable_name.replace('d[0]', d[0])
                else:
                    temp_variable_name = temp_variable_name.replace('d[' + str(x) + ']', d[x])

            try:
                # table name is the table to insert into, via the CSV filename
                # temp_variable_name is the insert string, such as 'id_company=d[0], company_name=d[1], ticker=d[2]'
                ins = table_name.insert().values(temp_variable_name)
                result = conn.execute(ins)
            except Exception, e:
                print 'error : ' + str(e)
You can do this with Insert objects and the csv module; the DictReader class makes it easy. Here's an example for the company table:
import csv
from sqlalchemy import create_engine
from sqlalchemy.sql import table, column

NULL_FIELD_VALUE = r'\N'
DB_CONNECT = 'sqlite:///company.db'

engine = create_engine(DB_CONNECT, echo=True)
conn = engine.connect()

with open('company.csv') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='|')
    insert_table = table('company',
                         *[column(field) for field in reader.fieldnames])
    insert_dict = [{k: None if v == NULL_FIELD_VALUE else v
                    for k, v in row.items()}
                   for row in reader]
    conn.execute(insert_table.insert(), insert_dict)
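To cover the "one table per CSV" requirement in the question, a hedged sketch extending the snippet above: the table name is derived from the filename, and files and path are assumed to exist as in the question's script:
import os

# Hedged sketch: apply the DictReader approach to every CSV in the directory,
# deriving each table name from the filename as the question describes.
for f in files:
    if f.endswith('.csv'):
        table_name = f.replace('.csv', '')
        with open(os.path.join(path, f)) as csvfile:
            reader = csv.DictReader(csvfile, delimiter='|')
            insert_table = table(table_name,
                                 *[column(field) for field in reader.fieldnames])
            rows = [{k: None if v == NULL_FIELD_VALUE else v
                     for k, v in row.items()}
                    for row in reader]
            if rows:
                conn.execute(insert_table.insert(), rows)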
I am using the MySQL connector for Python (on Windows) and I'm trying to run the following SQL statement, which loads a .csv file:
sql1 = ('SET GLOBAL local_infile = "ON";')
cursor.execute(sql1)
sql2 = ('LOAD DATA LOCAL INFILE "' + path[1:-1] + '" INTO TABLE mytable COLUMNS TERMINATED BY "," LINES TERMINATED BY "\\r\\n" (COL0, COL1, COL2, COL3, COL4, COL5, COL6) SET COL7 = "' + some_data + '";')
cursor.execute(sql2)
but when I try to execute it, I receive the following exception:
1148 (42000): The used command is not allowed with this MySQL version
If I execute LOAD DATA LOCAL INFILE in the MySQL console, everything runs fine.
LOAD DATA INFILE is disabled by default with Connector/Python.
When creating the connection, set the LOCAL_FILES client flag like this:
from mysql.connector.constants import ClientFlag
conn = mysql.connector.connect(...., client_flags=[ClientFlag.LOCAL_FILES])
There are a lot of security issues with LOAD DATA, so the server is really picky. Are you logging in to localhost, not the public IP of the server? Often one IP will be granted LOAD DATA, but the other won't.
See the fine manual
You could iterate through each line of the file, inserting each as a row. This is easy since you already mentioned that each column is delimited by "," and each row is delimited by a newline.
For example, assuming your table mytable has 8 string columns (COL0 to COL7):
input_file = open(path[1:-1], 'r')
# Loop through the lines of the input file, inserting each as a row in mytable
for line_of_input_file in input_file:
    values_from_file = line_of_input_file.rstrip('\r\n').split(',')  # get the columns from the line read from the file
    if len(values_from_file) == 7:  # ensure that 7 columns are accounted for on this line of the file
        sql_insert_row = "INSERT INTO mytable VALUES (" + values_from_file[0] + "," + values_from_file[1] + "," + values_from_file[2] + "," + values_from_file[3] + "," + values_from_file[4] + "," + values_from_file[5] + "," + values_from_file[6] + "," + some_data + ");"
        cursor.execute(sql_insert_row)
input_file.close()
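A hedged variant of the same idea that avoids hand-building the SQL string: parameterized placeholders with executemany, which also batches the inserts (cursor.executemany with %s placeholders is standard Connector/Python usage; cnx is a placeholder name for the connection):
# Hedged sketch: parameterized, batched inserts instead of string concatenation.
rows = []
with open(path[1:-1], 'r') as input_file:
    for line in input_file:
        values = line.rstrip('\r\n').split(',')
        if len(values) == 7:              # 7 columns from the file + some_data
            rows.append(values + [some_data])

sql_insert = "INSERT INTO mytable VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
cursor.executemany(sql_insert, rows)
cnx.commit()                              # cnx is the Connector/Python connection (placeholder name)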
With the MySQLdb driver:
import MySQLdb
from MySQLdb.constants import CLIENT
then, along with the other arguments to MySQLdb.connect(), pass client_flag=CLIENT.LOCAL_FILES.
Discovered by studying the source, and then trying it out.
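A minimal sketch of that MySQLdb variant (host/user/passwd/db values are placeholders):
import MySQLdb
from MySQLdb.constants import CLIENT

# Hedged sketch: placeholder credentials; the important part is client_flag.
conn = MySQLdb.connect(host="localhost", user="someuser", passwd="somepass",
                       db="somedb", client_flag=CLIENT.LOCAL_FILES)
cursor = conn.cursor()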