I'm trying to run parallel threads in a Spark job. This works without a hitch when I run the Python script from the CLI, but my understanding is that this isn't really capitalizing on the EMR cluster's parallel-processing benefits. When I run it as a Spark job it does not actually save the data, and I'm not even sure it's creating the Spark dataframe at all.
I also tried using map instead of the parallel threads, but couldn't get that to work either.
If I can't get the parallelism to work as a Spark job, it seems like I might as well just run it on a single EC2 instance with the parallel threads.
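For reference, the map-style distribution I was aiming for looks roughly like this (a sketch only; process_one_file is a hypothetical stand-in for the per-file work shown below, and master_file_list is the list pulled from SQS):
# rough sketch of the map attempt: ship the file list to the executors and let
# each executor run plain-Python conversion work (no Spark calls inside the worker)
file_rdd = sc.parallelize(master_file_list, numSlices=10)
file_rdd.foreach(process_one_file)
The catch, as far as I understand it, is that whatever runs inside foreach executes on the executors, so it cannot create DataFrames or use sc the way convert_to_parquet below does.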
So the basic logic is this -
Create spark context at top of script
Inside a class - Pull list of files to process from an SQS queue
Loop over list of files with the following method
# this is run for 10 blocks of 10 files each across the EMR cluster in parallel
def parquet_driver(self):
    max_threads = 20
    futures = []
    pool = ThreadPoolExecutor(max_threads)
    i = 0
    total_files_processed = 0
    while total_files_processed <= len(self.master_file_list):
        while i < max_threads:
            print('Processing %s' % self.master_file_list[i])
            futures.append(pool.submit(self.convert_to_parquet,
                                       self.master_file_list[i]))
            i += 1
        for x in as_completed(futures):
            pass
        # add in i number of files to the total
        total_files_processed += i
Notice this is passing a file to a method called "convert_to_parquet".
def convert_to_parquet(self, file):
    log_file_name = file.split(':')[2].replace('.dat', '.log')
    logger = Logger(log_file_name).get()
    try:
        bucket = s3.Bucket(file.split(':')[0])
        file_name = file.split(':')[2]
        file_obj = bucket.Object(file.split(':')[1] + '/' + file.split(':')[2])
        partition_key = file.split(':')[2].split('.')[2]
        target_table = file.split(':')[2].split('.')[1]
        receipt_handle = file.split(':')[3]
        file_contents = file_obj.get()["Body"].read()
        if 'al1' not in file.split(':')[2]:
            logger.debug('Record type = %s, deleting from queue and returning ..' % target_table)
        else:
            logger.debug('Working on %s..' % target_table)
            app_name = file
            #sc = SparkContext(appName=app_name)
            print('Reading the following file from s3: %s' % file_name)
            print('Found the following file contents on s3: %s' % file_contents)
            rdd = sc.parallelize(file_contents.split('\n')).map(lambda line: line.split(','))
            # rdd = sc.textFile(csv_file).map(lambda line: line.split(','))
            # pd.read_csv(csv_file)
            sqlContext = sql.SQLContext(sc)
            if hasattr(rdd, "toDF"):
                df = rdd.toDF()
            else:
                spark = SparkSession
                df = rdd.toDF()
            logger.debug("Partitioning data to: {0}".format(partition_key))
            # Go to redshift and get the data definition
            metadata = self.build_df_definition('al1')
            if 'cycle_date' in metadata['columns']:
                metadata['columns'].remove('cycle_date')
            if 'log_timestamp' in metadata['columns']:
                metadata['columns'].remove('log_timestamp')
            cols = metadata['columns']
            data_types = metadata['data_types']
            for idx in range(0, len(cols)):
                col_str = '_' + str(int(idx) + 1)
                df_field_value = regexp_replace(df[col_str], '"', '')
                df = df.withColumn(cols[idx], df_field_value.cast(data_types[idx]))
            df = df.withColumn("cycle_date", lit(partition_key))
            # this field will be pushed to the sqs queue
            df = df.withColumn("log_timestamp", lit(self.log_timestamp))
            full_cols = cols
            full_cols.append('cycle_date')
            full_cols.append('log_timestamp')
            print(full_cols)
            ref_df = df.select(full_cols)
            ref_df.show()
            partitionby = ['year', 'month', 'day']
            output = '/opt/data/' + '/' + target_table
            s3_loc = 's3://<bucket>/<prefix>/' + target_table
            codec = 'snappy'
            ref_df.write.partitionBy(['cycle_date']).format("parquet").save(s3_loc, mode="append")
            #sc.stop()
    except Exception as e:
        logger.debug(e)
        traceback.print_exc()
        open("/opt/logs/dump.log", "w").write(traceback.print_exc())
        exit()
    else:
        # Delete received message from queue
        sqs.delete_message(
            QueueUrl=self.queue_url,
            ReceiptHandle=receipt_handle
        )
        logger.debug('Received and deleted file: %s' % file)
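For comparison, the commented-out textFile route above would let Spark read the object straight from S3 instead of downloading the bytes with boto3 first. A minimal sketch, reusing the same placeholder bucket/prefix and assuming sqlContext exists as in the code above:
# sketch only: read the object from S3 with Spark and build the DataFrame from the RDD
s3_path = 's3://<bucket>/<prefix>/' + file_name
rdd = sc.textFile(s3_path).map(lambda line: line.split(','))
df = sqlContext.createDataFrame(rdd)  # or rdd.toDF(), once a SQLContext exists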
Related
I have many text files on GCS and I want to load them into BigQuery. I already loaded them, but the fields get inserted into the wrong columns. Here is my code.
def string_2_dataframe(string, names, widths, audit_load_key):
    io_string = StringIO(string)
    dataframe = pandas.read_csv(
        io_string, names=names, sep="\t", dtype=str)
    dataframe = dataframe.fillna("")
    dataframe = dataframe.replace("nan", "")
    dataframe['audit_load_key'] = audit_load_key
    print(dataframe)
    return dataframe

def dataframe_2_bigquery(dataframe, table_id, job_config):
    bigquery_client = bigquery.Client()
    # Make an API request.
    job = bigquery_client.load_table_from_dataframe(
        dataframe, config[table_id])
    job.result()  # Wait for the job to complete.
    table = bigquery_client.get_table(config[table_id])
    return "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), config[table_id])

def blob_2_bigquery(blob_name, fnames, fwidths, table_id, job_config, audit_load_key):
    fs = GCSFileSystem()
    source = f"{config['bucket']}/{blob_name}"
    with fs.open(source, "r", encoding="utf-8", errors="ignore") as source:
        count = 0
        block = ""
        while True:
            line = source.readline()
            block += line
            count += 1
            if count % 10000 == 0:
                dataframe = string_2_dataframe(block, fnames, fwidths, audit_load_key)
                result = dataframe_2_bigquery(dataframe, table_id, job_config)
                block = ""
            if not line:
                dataframe = string_2_dataframe(block, fnames, fwidths, audit_load_key)
                result = dataframe_2_bigquery(dataframe, table_id, job_config)
                break
And here is my sample text file.
Please help me to solve this issue.
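One direction I am considering is passing an explicit schema through the job_config that dataframe_2_bigquery already receives but never uses, instead of relying on schema autodetection. A rough sketch (the column names and types are placeholders for whatever the real files contain; config and table_id are the same as above):
from google.cloud import bigquery

# hypothetical schema: replace with the real column names and types
job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("field_1", "STRING"),
        bigquery.SchemaField("field_2", "STRING"),
        bigquery.SchemaField("audit_load_key", "STRING"),
    ],
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)

def dataframe_2_bigquery(dataframe, table_id, job_config):
    bigquery_client = bigquery.Client()
    # pass job_config through so the load uses the declared schema
    job = bigquery_client.load_table_from_dataframe(
        dataframe, config[table_id], job_config=job_config)
    job.result()  # Wait for the job to complete.
    return bigquery_client.get_table(config[table_id])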
I am trying to export a whole database schema (around 20 GB) with a PostgreSQL query in order to create a single final HDF5 file.
Because this doesn't fit in my computer's memory, I am using the chunksize argument.
First I use this function to establish a connection:
def make_connectstring(prefix, db, uname, passa, hostname, port):
    """return an sql connectstring"""
    connectstring = prefix + "://" + uname + ":" + passa + "@" + hostname + \
        ":" + port + "/" + db
    return connectstring
Then I create a temporary folder to save each HDF5 chunk file:
def query_to_hdf5(connectstring, query, verbose=False, chunksize=50000):
    engine = sqlalchemy.create_engine(connectstring,
                                      server_side_cursors=True)
    # get the data into temp chunk files
    i = 0
    paths_chunks = []
    with tempfile.TemporaryDirectory() as td:
        for df in pd.read_sql_query(sql=query, con=engine, chunksize=chunksize):
            path = td + "/chunk" + str(i) + ".hdf5"
            df.to_hdf(path, key='data')
            print(path)
            if verbose:
                print("wrote", path)
            paths_chunks.append(path)
            i += 1
connectstring = make_connectstring(prefix, db, uname, passa, hostname, port)
query = "SELECT * FROM public.zz_ges"
df = query_to_hdf5(connectstring, query)
What is the best way to merge all these files into one single file that represents the whole dataframe?
I tried something like this:
df = pd.DataFrame()
print(path)
for path in paths_chunks:
    df_scratch = pd.read_hdf(path)
    df = pd.concat([df, df_scratch])
    if verbose:
        print("read", path)
However, the memory goes up very fast. I need something that could be more efficient.
Update:
def make_connectstring(prefix, db, uname, passa, hostname, port):
    """return an sql connectstring"""
    connectstring = prefix + "://" + uname + ":" + passa + "@" + hostname + \
        ":" + port + "/" + db
    return connectstring

def query_to_df(connectstring, query, verbose=False, chunksize=50000):
    engine = sqlalchemy.create_engine(connectstring,
                                      server_side_cursors=True)
    # stream the chunks straight into a single HDF5 store
    with pd.HDFStore('output.h5', 'w') as store:
        for df in pd.read_sql_query(sql=query, con=engine, chunksize=chunksize):
            store.append('data', df)
I'd suggest using an HDFStore directly; that way you can append chunks as you get them from the database, something like:
with pd.HDFStore('output.h5', 'w') as store:
    for df in pd.read_sql_query(sql=query, con=engine, chunksize=chunksize):
        store.append('data', df)
This is based around your existing code, so it isn't complete; let me know if anything is unclear.
Note that I'm opening the store in 'w' mode, so it will recreate the file every time. Otherwise append would just keep adding the same rows to the end of the table; alternatively you could remove the key first.
When you open the store you also get lots of options, like which compression to use, but they don't seem to be well documented; help(pd.HDFStore) describes complevel and complib for me (see the sketch below).
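A minimal sketch of that compression option, assuming the same query, engine and chunksize as above (the complevel/complib values are just examples):
import pandas as pd

# open a compressed store once and stream every chunk into a single appendable table
with pd.HDFStore('output.h5', mode='w', complevel=9, complib='blosc') as store:
    for chunk in pd.read_sql_query(sql=query, con=engine, chunksize=chunksize):
        store.append('data', chunk)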
I have a zip file with many .dat files. To each of them I want to apply a function that outputs two results, and I want to save those results, plus the time each call takes, into three lists. The order matters. Here is the code to do it with no parallel computing:
result_1 = []
result_2 = []
runtimes = []
args_function = 'some args'  # Always the same

with zipfile.ZipFile(zip_file, "r") as zip_ref:
    for name in sorted(zip_ref.namelist()):
        data = np.loadtxt(zip_ref.open(name))
        start_time = time.time()
        a, b = function(data, args_function)
        runtimes.append(time.time() - start_time)
        result_1.append(a)
        result_2.append(b)
This seems to me embarrassingly parallel, so I did:
result_1 = []
result_2 = []
runtimes = []
args_function = 'some args'  # Always the same

def compute_paralel(name, zip_ref):
    data = np.loadtxt(zip_ref.open(name))
    start_time = time.time()
    a, b = function(data, args_function)
    runtimes.append(time.time() - start_time)
    result_1.append(a)
    result_2.append(b)

with zipfile.ZipFile(zip_file, "r") as zip_ref:
    Parallel(n_jobs=-1)(delayed(compute_paralel)(name, zip_ref) for name in sorted(zip_ref.namelist()))
But this raises the following error: pickle.PicklingError: Could not pickle the task to send it to the workers. Therefore I'm not really sure what to do... Any ideas?
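For reference, a picklable variant of the idea would look roughly like the sketch below (my own assumptions: function and args_function are defined at module level, the worker returns its values instead of appending to shared lists, and it opens the zip itself so only plain strings have to be pickled):
from joblib import Parallel, delayed
import numpy as np
import time
import zipfile

def compute_one(zip_file, name, args_function):
    # open the archive inside the worker so only the file name needs to be pickled
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        data = np.loadtxt(zip_ref.open(name))
    start_time = time.time()
    a, b = function(data, args_function)
    return a, b, time.time() - start_time

with zipfile.ZipFile(zip_file, "r") as zip_ref:
    names = sorted(zip_ref.namelist())

outputs = Parallel(n_jobs=-1)(delayed(compute_one)(zip_file, name, args_function) for name in names)
result_1, result_2, runtimes = (list(x) for x in zip(*outputs))
Since Parallel returns results in the order of the input iterable, the three lists stay aligned with the sorted names.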
Hello to all passionate programmers out there. I need your help with my code.
My Goal: To efficiently move data from Amazon S3 to Amazon Redshift.
Basically I am moving all the CSV files in my S3 bucket to Redshift using the code below. I parse through part of the file, build a table structure, and then use the COPY command to load the data into Redshift.
'''
Created on Feb 25, 2015

@author: Siddartha.Reddy
'''
import sys
from boto.s3 import connect_to_region
from boto.s3.connection import Location
import csv
import itertools
import psycopg2
''' ARGUMENTS TO PASS '''
AWS_KEY = sys.argv[1]
AWS_SECRET_KEY = sys.argv[2]
S3_DOWNLOAD_PATH = sys.argv[3]
REDSHIFT_SCHEMA = sys.argv[4]
TABLE_NAME = sys.argv[5]
UTILS = S3_DOWNLOAD_PATH.split('/')
class UTIL():
    global UTILS
    def bucket_name(self):
        self.BUCKET_NAME = UTILS[0]
        return self.BUCKET_NAME
    def path(self):
        self.PATH = ''
        offset = 0
        for value in UTILS:
            if offset == 0:
                offset += 1
            else:
                self.PATH = self.PATH + value + '/'
        return self.PATH[:-1]
def GETDATAINMEMORY():
    conn = connect_to_region(Location.USWest2, aws_access_key_id=AWS_KEY,
                             aws_secret_access_key=AWS_SECRET_KEY,
                             is_secure=False, host='s3-us-west-2.amazonaws.com'
                             )
    ut = UTIL()
    BUCKET_NAME = ut.bucket_name()
    PATH = ut.path()
    filelist = conn.lookup(BUCKET_NAME)
    ''' Fetch part of the data from S3 '''
    for path in filelist:
        if PATH in path.name:
            DATA = path.get_contents_as_string(headers={'Range': 'bytes=%s-%s' % (0, 100000000)})
    return DATA
def TRAVERSEDATA():
    DATA = GETDATAINMEMORY()
    CREATE_TABLE_QUERY = 'CREATE TABLE ' + REDSHIFT_SCHEMA + '.' + TABLE_NAME + '( '
    JUNKED_OUT = DATA[3:]
    PROCESSED_DATA = JUNKED_OUT.split('\n')
    CSV_DATA = csv.reader(PROCESSED_DATA, delimiter=',')
    COUNTER, STRING, NUMBER = 0, 0, 0
    COLUMN_TYPE = []
    ''' GET COLUMN NAMES AND COUNT '''
    for line in CSV_DATA:
        NUMBER_OF_COLUMNS = len(line)
        COLUMN_NAMES = line
        break
    ''' PROCESS COLUMN NAMES '''
    a = 0
    for REMOVESPACE in COLUMN_NAMES:
        TEMPHOLDER = REMOVESPACE.split(' ')
        temp1 = ''
        for x in TEMPHOLDER:
            temp1 = temp1 + x
        COLUMN_NAMES[a] = temp1
        a = a + 1
    ''' GET COLUMN DATA TYPES '''
    # print(NUMBER_OF_COLUMNS,COLUMN_NAMES,COUNTER)
    # print(NUMBER_OF_COLUMNS)
    i, j, a = 0, 500, 0
    while COUNTER < NUMBER_OF_COLUMNS:
        for COLUMN in itertools.islice(CSV_DATA, i, j + 1):
            if COLUMN[COUNTER].isdigit():
                NUMBER = NUMBER + 1
            else:
                STRING = STRING + 1
        if NUMBER == 501:
            COLUMN_TYPE.append('INTEGER')
            # print('I CAME IN')
            NUMBER = 0
        else:
            COLUMN_TYPE.append('VARCHAR(2500)')
            STRING = 0
        COUNTER = COUNTER + 1
        # print(COUNTER)
    COUNTER = 0
    ''' BUILD SCHEMA '''
    while COUNTER < NUMBER_OF_COLUMNS:
        if COUNTER == 0:
            CREATE_TABLE_QUERY = CREATE_TABLE_QUERY + COLUMN_NAMES[COUNTER] + ' ' + COLUMN_TYPE[COUNTER] + ' NOT NULL,'
        else:
            CREATE_TABLE_QUERY = CREATE_TABLE_QUERY + COLUMN_NAMES[COUNTER] + ' ' + COLUMN_TYPE[COUNTER] + ' ,'
        COUNTER += 1
    CREATE_TABLE_QUERY = CREATE_TABLE_QUERY[:-2] + ')'
    return CREATE_TABLE_QUERY
def COPY_COMMAND():
    S3_PATH = 's3://' + S3_DOWNLOAD_PATH
    COPY_COMMAND = "COPY " + REDSHIFT_SCHEMA + "." + TABLE_NAME + " from '" + S3_PATH + "' credentials 'aws_access_key_id=" + AWS_KEY + ";aws_secret_access_key=" + AWS_SECRET_KEY + "' REGION 'us-west-2' csv delimiter ',' ignoreheader as 1 TRIMBLANKS maxerror as 500"
    return COPY_COMMAND
def S3TOREDSHIFT():
    conn = psycopg2.connect("dbname='xxx' port='5439' user='xxx' host='xxxxxx' password='xxxxx'")
    cursor = conn.cursor()
    cursor.execute('DROP TABLE IF EXISTS ' + REDSHIFT_SCHEMA + "." + TABLE_NAME)
    SCHEMA = TRAVERSEDATA()
    print(SCHEMA)
    cursor.execute(SCHEMA)
    COPY = COPY_COMMAND()
    print(COPY)
    cursor.execute(COPY)
    conn.commit()

S3TOREDSHIFT()
Current Challenges:
Challenges with creating the table structure:
Field lengths: right now I am just hardcoding the VARCHAR fields to 2500. All my files are > 30 GB, and parsing through a whole file to calculate the length of each field takes a lot of processing time.
Determining if a column is null: I am simply hardcoding the first column to NOT NULL using the COUNTER variable (all my files have an ID as the first column). I would like to know if there is a better way of doing it.
Is there any data structure I can use? I am always interested in learning new ways to improve the performance, so if you have any suggestions please feel free to comment; one idea I'm toying with is sketched below.
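For example, instead of scanning a whole 30 GB file, I could sample only the first few hundred rows to estimate a width and a type per column. A rough sketch (the sample size, the width padding and the helper name are arbitrary choices):
import csv
import itertools

def infer_columns(csv_lines, sample_rows=500):
    # sample the first rows once to guess a type and a maximum width per column
    reader = csv.reader(csv_lines, delimiter=',')
    header = [name.replace(' ', '') for name in next(reader)]
    widths = [1] * len(header)
    numeric = [True] * len(header)
    for row in itertools.islice(reader, sample_rows):
        for idx, value in enumerate(row):
            widths[idx] = max(widths[idx], len(value))
            numeric[idx] = numeric[idx] and value.isdigit()
    # double the sampled width as a safety margin (pure heuristic)
    types = ['INTEGER' if is_num else 'VARCHAR(%d)' % (widths[idx] * 2)
             for idx, is_num in enumerate(numeric)]
    return header, types
This still guesses wrong if a later row is wider than anything in the sample, so it trades accuracy for speed.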
I am creating my own bootloader for an ATXmega128A4U. To use the bootloader I want to transform the ELF file of the firmware into a memory map used in the ATXmega.
For that I use Python and the module "pyelftools". Its documentation is poor, so I ran into a problem: I do not know which information I can use to get the address, offset, etc. of the data in the sections.
My goal is to create a bytearray, copy the data/code into it and transfer it to the bootloader. Below is my code:
import sys
# If pyelftools is not installed, the example can also run from the root or
# examples/ dir of the source distribution.
sys.path[0:0] = ['.', '..']
from elftools.common.py3compat import bytes2str
from elftools.elf.elffile import ELFFile

# 128k flash for the ATXmega128a4u
flashsize = 128 * 1024

def process_file(filename):
    with open(filename, 'rb') as f:
        # get the data
        elffile = ELFFile(f)
        dataSec = elffile.get_section_by_name(b'.data')
        textSec = elffile.get_section_by_name(b'.text')
        # prepare the memory
        flashMemory = bytearray(flashsize)
        # the data section
        startAddr = dataSec.header.sh_offset
        am = dataSec.header.sh_size
        i = 0
        while i < am:
            val = dataSec.stream.read(1)
            flashMemory[startAddr] = val[0]
            startAddr += 1
            i += 1
        # the text section
        startAddr = textSec.header.sh_offset
        am = textSec.header.sh_size
        i = 0
        while i < am:
            print(str(startAddr) + ' : ' + str(i))
            val = textSec.stream.read(1)
            flashMemory[startAddr] = val[0]
            startAddr += 1
            i += 1
        print('finished')

if __name__ == '__main__':
    process_file('firmware.elf')
Hope someone can tell me how to solve this problem.
I managed to solve the problem.
Don't read the data manually from the stream with "textSec.stream.read"; use "textSec.data()" instead. Internally (see "sections.py") a seek operation on the file is done first and then the data is read, so the result is the valid data chunk.
The following code reads the code (text) section of an ATXmega firmware and copies it into a bytearray which has the layout of the flash of an ATXmega128A4U device.
@vlas_tepesch: the hex conversion is not needed and the 64k pitfall is avoided.
import sys

sys.path[0:0] = ['.', '..']
from elftools.common.py3compat import bytes2str
from elftools.elf.elffile import ELFFile

# 128k flash for the ATXmega128a4u
flashsize = 128 * 1024

def __printSectionInfo(s):
    print('[{nr}] {name} {type} {addr} {offs} {size}'.format(
        nr=s.header['sh_name'],
        name=s.name,
        type=s.header['sh_type'],
        addr=s.header['sh_addr'],
        offs=s.header['sh_offset'],
        size=s.header['sh_size']
    ))

def process_file(filename):
    print('In file: ' + filename)
    with open(filename, 'rb') as f:
        # get the data
        elffile = ELFFile(f)
        print('sections:')
        for s in elffile.iter_sections():
            __printSectionInfo(s)
        print('get the code from the .text section')
        textSec = elffile.get_section_by_name(b'.text')
        # prepare the memory
        flashMemory = bytearray(flashsize)
        # the text section
        startAddr = textSec.header['sh_addr']
        val = textSec.data()
        flashMemory[startAddr:startAddr + len(val)] = val
        # print memory
        print('finished')

if __name__ == '__main__':
    process_file('firmware.elf')
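Since the end goal is to hand the image to the bootloader, the simplest transfer artifact is a raw binary dump of the bytearray. A tiny sketch (the file name is arbitrary, and process_file would have to return flashMemory for this to be usable):
def dump_flash_image(flashMemory, out_path='flash_image.bin'):
    # write the assembled flash image as a raw binary for the transfer tool
    with open(out_path, 'wb') as out:
        out.write(flashMemory)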
Thanks for the comments!