SQLAlchemy performance difference between engine and connection execution

SQLAlchemy performance difference between engine and connection execution - python

I'm experiencing strange behaviour with SQLAlchemy from a performance viewpoint.
I've created this simple test script:
from sqlalchemy import *
from time import clock, time
from datetime import datetime
import os
# Benchmark script (Python 2 syntax: note the `print` statements).
# It runs the same UPDATE twice -- once through an explicit Connection and
# once through the statement's own execute() -- and records the elapsed
# clock() time of each run in the report string `s`.
s = "Start: "+str(datetime.now())+"\n"
engine = create_engine("sqlite:///C:/mydb.db?check_same_thread=False")
print engine
# id values matched by the IN (...) filter of the UPDATE built below.
ids = ('ecab910b-261b-11e9-9479-5800e3d291a6',
'ecab90f0-261b-11e9-8801-5800e3d291a6',
'ecab90f1-261b-11e9-bcf7-5800e3d291a6',
'ecab90f2-261b-11e9-a7c1-5800e3d291a6',
'ecab90f3-261b-11e9-b8d0-5800e3d291a6',
'ecab90f4-261b-11e9-8a61-5800e3d291a6',
'ecab90f5-261b-11e9-a11a-5800e3d291a6',
'ecab90f6-261b-11e9-9a55-5800e3d291a6',
'ecab90f7-261b-11e9-b16e-5800e3d291a6',
'ecab90f8-261b-11e9-82a6-5800e3d291a6',
'ecab90f9-261b-11e9-ad9d-5800e3d291a6',
'ecab90fa-261b-11e9-ae23-5800e3d291a6',
'ecab90fb-261b-11e9-93a5-5800e3d291a6',
'ecab90fc-261b-11e9-b963-5800e3d291a6',
'ecab90fd-261b-11e9-b875-5800e3d291a6',
'ecab90fe-261b-11e9-ae90-5800e3d291a6',
'ecab90ff-261b-11e9-8e07-5800e3d291a6',
'ecab9101-261b-11e9-8808-5800e3d291a6',
'ecab9102-261b-11e9-82e5-5800e3d291a6',
'ecab9103-261b-11e9-80e7-5800e3d291a6',
'ecab9108-261b-11e9-ad8c-5800e3d291a6',
'ecab9109-261b-11e9-80d0-5800e3d291a6',
'ecab910a-261b-11e9-9c5d-5800e3d291a6',
'ecab910d-261b-11e9-86f7-5800e3d291a6',
'ecab910e-261b-11e9-9322-5800e3d291a6',
'ecab910f-261b-11e9-b454-5800e3d291a6',
'ecab9114-261b-11e9-b1ec-5800e3d291a6',
'ecab911b-261b-11e9-b40c-5800e3d291a6')
# The MetaData is constructed with the engine, which is what lets the
# statement be run later as q.execute() without passing a connection.
metadata = MetaData(engine)
# autoload=True: column definitions of "mytab" are read from the database.
tab = Table("mytab", metadata, autoload=True)
# UPDATE mytab SET status = 5 WHERE id IN (<ids>)
q = update(tab, tab.c.id.in_(ids), {"status":5})
conn = engine.connect()
# Timed run 1: execute through the explicit connection opened above.
el = clock()
conn.execute(q)
s += "Elapsed (from connection) = %f\n" %(clock()-el)
# Timed run 2: execute through the statement itself (no connection passed).
# NOTE(review): both runs issue the identical UPDATE against the same rows and
# always in this order, and `conn` is still open during run 2 -- ordering or
# locking effects may contribute to the measured difference; confirm by
# swapping the two runs.
el = clock()
q.execute()
s += "Elapsed (from engine) = %f\n" %(clock()-el)
print s
In the script I'm simply comparing the execution time when the query is executed from the engine and from the connection. Here is the output:
Engine(sqlite:///C:/mydb.db?check_same_thread=False)
Start: 2019-02-01 13:41:28.771000
Elapsed (from connection) = 0.007447
Elapsed (from engine) = 0.576755
Can anyone help me understand why there is such a huge difference in execution time, and what the right approach is for using SQLAlchemy with non-ORM queries?
Thanks in advance.

Related

Issue with Multiprocessing script in terminal

When I try to run my multiprocessing script in a terminal, I keep getting this error message:
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
This is my script:
import time
from multiprocessing import Pool
from multiprocessing import freeze_support
import getpass
import jaydebeapi
import pandas as pd
import numpy as np
from multiprocessing import Process, freeze_support, set_start_method
# Worker function: selects loan ids from PML for an event-id range, then runs
# one ODS lookup per returned row and wraps the fetched rows in a DataFrame.
# It reads the module-level names PML and ODS (cursors), which this file only
# assigns inside the `if __name__ == '__main__':` section.
# NOTE(review): presumably those names do not exist in worker processes that
# are started by re-importing this module (spawn) -- confirm.
# NOTE(review): indentation was lost in this listing, so it cannot be seen
# whether `return` sits inside or after the `for` loop.
def test(first_evnt, last_evnt):
PML_loan_Query = "select b.id_lpa_alt_loan from udbadm.pml_lst_cmpltd_trans_mtch a join udbadm.lpa_altv_loan_idtn b on a.id_evnt = b.id_evnt where b.cd_lpa_alt_loan_idtn = 'HewlettPackardGeneratedTransaction' and a.id_evnt BETWEEN ? AND ?"
PML.execute(PML_loan_Query,(first_evnt, last_evnt))
loan_records = PML.fetchall()
df = pd.DataFrame()
for x in loan_records:
# Populating the ODS table
#borr_query = "SELECT nbr_aus, CAST(NULLIF(NULLIF(cd_idx, -9999), 0.000000) AS VARCHAR(100)) AS cd_idx, CAST(rate_curr_int AS INT) AS rate_curr_int, CAST(NULLIF(rate_gr_mrtg_mrgn,0) AS INT) AS rate_gr_mrtg_mrgn, CAST(rate_loln_max_cap AS INT) AS rate_loln_max_cap, CAST(NULLIF(rate_perdc_cap,0) AS INT) AS rate_perdc_cap FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus BETWEEN ? AND ?"
borr_query = 'SELECT nbr_aus, CAST(NULLIF(NULLIF(cd_idx, -9999), 0.000000) AS VARCHAR(10)) AS cd_idx, CAST(rate_curr_int AS VARCHAR(10)) AS rate_curr_int, CAST(NULLIF(rate_gr_mrtg_mrgn,0) AS VARCHAR(10)) AS rate_gr_mrtg_mrgn, CAST(rate_loln_max_cap AS VARCHAR(10)) AS rate_loln_max_cap, CAST(NULLIF(rate_perdc_cap,0) AS VARCHAR(10)) AS rate_perdc_cap FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus IN (?)'
#borr_query = "SELECT DISTINCT nbr_aus FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus BETWEEN ? AND ?"
# The fetched row `x` itself is passed as the parameter sequence for the single `?`.
ODS.execute(borr_query, x)
#ODS.execute(ODS_list)
ODS_records = ODS.fetchall()
# `df` is never reassigned, so it is still the empty frame created above;
# this line rebinds ODS_records to (empty df + this iteration's rows), i.e.
# results of earlier iterations are not accumulated.
ODS_records = df.append(pd.DataFrame(ODS_records, columns = ['nbr_aus', 'cd_idx', 'rate_curr_int', 'rate_gr_mrtg_mrgn', 'rate_loln_max_cap', 'rate_perdc_cap']))
return ODS_records
# Script entry point: prompts for two passwords, opens cursors on the ODS and
# PML databases, then maps `test` over a worker pool and prints the result.
# NOTE(review): indentation was lost in this listing, so it cannot be seen
# which of the following statements are inside the `if __name__` guard.
if __name__ == '__main__':
freeze_support()
pw = getpass.getpass(prompt="Password", stream=False)
# establishing database to the ODS database
ODS = jaydebeapi.connect('com.ibm.db2.jcc.DB2Driver','jdbc:db2://he3qlxvtdbs351.fhlmc.com:50001/DB2QLTY', ['f408195', pw],'C:/JDBC/db2jcc.jar')
# Allows SQL statements between the ODS database
# (the connection object is replaced by its cursor under the same name)
ODS = ODS.cursor()
# creating the password needed to establish PML database connection
pw_2 = getpass.getpass(prompt="Password", stream=False)
# establishing database to the PML database
PML = jaydebeapi.connect('com.ibm.db2.jcc.DB2Driver','jdbc:db2://he3qlxvtdbs957.fhlmc.com:50001/PMLFDB2', ['f408195', pw_2],'C:/JDBC/db2jcc.jar')
# Allows SQL statements between the PML database
PML = PML.cursor()
# Inclusive event-id bounds used in the BETWEEN ? AND ? filter of `test`.
first_evnt = 155643917
last_evnt = 155684481
p = Pool()
# NOTE(review): map() calls test(item) once per list element, i.e. with ONE
# positional argument each, while `test` is declared with two parameters
# (first_evnt, last_evnt). Passing both bounds to a single call would need
# something like p.starmap(test, [(first_evnt, last_evnt)]) -- confirm intent.
result = p.map(test, [first_evnt, last_evnt])
print(result)
p.close()
p.join()

Inserting into a Cassandra DB is slow even with execute_concurrent()

I am trying to insert a pandas DataFrame into Cassandra. I am using execute_concurrent, but I don't see any improvement: it takes almost 5 s per inserted row. There are 14k rows, so at this rate it will take more than 15 hours. I have 12 GB of RAM and 2 CPU cores. How fast can I run this operation? I've tried different concurrency values without any success. The following is my code:
from flask import session
import yaml
import pandas as pd
import argparse
from get_data import read_params
import cassandra
from cassandra.concurrent import execute_concurrent_with_args, execute_concurrent
from cassandra.cluster import Cluster, ExecutionProfile
from cassandra.auth import PlainTextAuthProvider
import sys
import time
def progressbar(it, prefix="", size=60, out=sys.stdout): # Python3.3+
    """Yield the items of ``it`` while drawing a one-line progress bar.

    The bar is redrawn in place (carriage return, no newline) before the
    first item and after each item has been consumed by the caller; a final
    newline is written once the iteration is exhausted.

    Args:
        it: A sized iterable (``len(it)`` is called up front).
        prefix: Text printed in front of the bar.
        size: Width of the bar in characters.
        out: Text stream the bar is written to.

    Yields:
        Each item of ``it``, unchanged and in order.
    """
    count = len(it)

    def show(j):
        # An empty input has nothing to advance through: draw the bar as
        # complete instead of dividing by zero.
        x = int(size * j / count) if count else size
        print("{}[{}{}] {}/{}".format(prefix, u"█" * x, "." * (size - x), j, count),
              end='\r', file=out, flush=True)

    show(0)
    for i, item in enumerate(it):
        yield item
        show(i + 1)
    print("\n", flush=True, file=out)
# Recreates the "big_mart" table in Cassandra and loads the concatenated
# train/test CSV data into it through a prepared INSERT statement.
# Any exception is re-raised as a generic Exception with a prefixed message.
# NOTE(review): indentation was lost in this listing, so the nesting of the
# statements after the `for ... progressbar` line cannot be seen.
def cassandraDBLoad(config_path):
try:
config = read_params(config_path)
# NOTE(review): `execution_profile` is assigned but not referenced again in this function.
execution_profile = ExecutionProfile(request_timeout=10)
cassandra_config = {'secure_connect_bundle': "path"}
auth_provider = PlainTextAuthProvider(
"client_id",
"client_secret"
)
cluster = Cluster(cloud=cassandra_config, auth_provider=auth_provider)
# Local name `session`; the file also has `from flask import session`, which this assignment hides inside the function.
session = cluster.connect()
session.default_timeout = None
connect_db = session.execute("select release_version from system.local")
set_keyspace = session.set_keyspace("Keyspace Name")
table_ = "big_mart"
# Every column is declared varchar, which is why all values are passed through str() below.
define_columns = "Item_Identifier varchar PRIMARY KEY, Item_Weight varchar, Item_Fat_Content varchar, Item_Visibility varchar, Item_Type varchar, Item_MRP varchar, Outlet_Identifier varchar, Outlet_Establishment_Year varchar, Outlet_Size varchar, Outlet_Location_type varchar, Outlet_Type varchar, Item_Outlet_Sales varchar, source varchar"
# The table is dropped and recreated on every call.
drop_table = f"DROP TABLE IF EXISTS {table_}"
drop_result = session.execute(drop_table)
create_table = f"CREATE TABLE {table_}({define_columns});"
table_result = session.execute(create_table)
train = pd.read_csv("train_source")
test = pd.read_csv("test_source")
#Combine test and train into one file
train['source']='train'
test['source']='test'
df = pd.concat([train, test],ignore_index=True)
df = df.fillna('NA')
columns = "Item_Identifier, Item_Weight, Item_Fat_Content, Item_Visibility, Item_Type, Item_MRP, Outlet_Identifier, Outlet_Establishment_Year, Outlet_Size, Outlet_Location_Type, Outlet_Type, Item_Outlet_Sales, source"
insert_qry = f"INSERT INTO {table_}({columns}) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)"
statement = session.prepare(insert_qry)
# One 13-tuple of strings per DataFrame row, in column order 0..12.
parameters = [
(str(df.iat[i,0]), str(df.iat[i,1]), str(df.iat[i,2]), str(df.iat[i,3]),
str(df.iat[i,4]), str(df.iat[i,5]), str(df.iat[i,6]), str(df.iat[i,7]),
str(df.iat[i,8]), str(df.iat[i,9]), str(df.iat[i,10]), str(df.iat[i,11]),
str(df.iat[i,12]))
for i in range(len(df))]
# The loop runs once per DataFrame row and sleeps 0.1 s each time, so the
# sleeps alone cost len(df) * 0.1 seconds.
# NOTE(review): if the execute_concurrent_with_args call below is inside this
# loop, the FULL `parameters` list is submitted again on every iteration --
# confirm against the original indentation.
for i in progressbar(range(len(df)), "Computing: ", 40):
time.sleep(0.1)
execute_concurrent_with_args(
session,
statement,
parameters,
concurrency=500
)
# NOTE(review): `batch` is not assigned anywhere in this function; unless it
# exists at module level this line raises NameError, which the handler below
# turns into the generic Exception.
session.execute(batch)
except Exception as e:
raise Exception("(cassandraDBLoad): Something went wrong in the CassandraDB Load operations\n" + str(e))
csv files link - https://drive.google.com/drive/folders/1O03lNTMfSwhUKG61zOs7fNxXIRe44GRp?usp=sharing

Even with concurrent asynchronous requests (execute_concurrent()), it will still be bottlenecked on the client side because there is only so much a single client process can do even when it's multi-threaded.
If you want to maximise the throughput of your cluster, we recommend scaling your app horizontally and running multiple instances (processes). This can be easily achieved with the Python driver using the multiprocessing module. For details, see the Python driver Performance Notes.
Finally, if your goal is to simply bulk-load data to your Cassandra DB, it makes no sense to re-invent the wheel by writing your own application when there are free, open-source tools that exist specifically for this use case.
You can use the DataStax Bulk Loader tool (DSBulk) to bulk load data in CSV format to a Cassandra table. Here are some references with examples to help you get started quickly:
Blog - DSBulk Intro + Loading data
Blog - More DSBulk Loading examples
Blog - Counting records with DSBulk
Docs - Loading data examples
DSBulk is open-source so it's free to use. Cheers!

Fastest way to insert many rows of data?

Every 4 seconds, I have to store 32,000 rows of data. Each of these rows consists of one time stamp value and 464 double precision values. The column name for the time stamp is time and the column name for the precision values increase sequentially as channel1, channel2, ..., and channel 464.
I establish a connection as follows:
# Connection URI: credentials are separated from the host by "@"
# (scheme://user:password@host:port/dbname). The previous "#" separator does
# not form a valid connection URI.
CONNECTION = f"postgres://{username}:{password}@{host}:{port}/{dbname}"  # optional suffix: ?sslmode=require
self.TimescaleDB_Client = psycopg2.connect(CONNECTION)
I then verify the TimescaleDB extension with the following:
def verifyTimeScaleInstall(self):
    """Ensure the ``timescaledb`` extension exists in the connected database.

    Executes ``CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE`` on
    ``self.TimescaleDB_Client`` and commits.

    Returns:
        ``None`` on success; ``False`` if any step raised, in which case the
        error is logged with its traceback on ``self.timescaleLogger``.
    """
    sql_query = "CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE;"
    try:
        cur = self.TimescaleDB_Client.cursor()
        try:
            cur.execute(sql_query)
        finally:
            # Release the cursor even when execute() fails.
            cur.close()
        self.TimescaleDB_Client.commit()
    except Exception:
        # logger.exception() records the message at ERROR level together with
        # the active traceback, so a separate format_exc() call is not needed.
        self.timescaleLogger.exception("An error occurred in verifyTimeScaleInstall")
        return False
I then create a hypertable for my data with the following:
def createRAWDataTable(self):
    """Create the ``raw_data`` table and register it as a hypertable on ``time``.

    The table has a ``time TIMESTAMPTZ NOT NULL`` column followed by one
    ``REAL`` column per channel, named ``channel1`` .. ``channel<N>`` where
    N is ``self.num_channel``. Both SQL strings are stored on the instance
    (``query_create_raw_data_table`` and ``query_create_raw_data_hypertable``)
    before they are executed and committed.

    Returns:
        ``None`` on success; ``False`` if any step raised, in which case the
        error is logged with its traceback on ``self.timescaleLogger``.
    """
    try:
        # Build the column list in one pass instead of growing the statement
        # string inside a loop with a None sentinel.
        channel_columns = "".join(
            f", channel{channel} REAL"
            for channel in range(1, self.num_channel + 1)
        )
        self.query_create_raw_data_table = (
            "CREATE TABLE IF NOT EXISTS raw_data (time TIMESTAMPTZ NOT NULL"
            + channel_columns
            + ");"
        )
        self.query_create_raw_data_hypertable = "SELECT create_hypertable('raw_data', 'time');"
        cur = self.TimescaleDB_Client.cursor()
        try:
            cur.execute(self.query_create_raw_data_table)
            cur.execute(self.query_create_raw_data_hypertable)
            self.TimescaleDB_Client.commit()
        finally:
            # Release the cursor even when a statement fails.
            cur.close()
    except Exception:
        self.timescaleLogger.exception("An error occurred in createRAWDataTable")
        return False
I then insert the data into the hypertable using the following:
def insertRAWData(self, seconds):
    """Insert ``seconds`` worth of random samples into ``raw_data``.

    Generates ``seconds * self.fs`` rows of ``self.num_channel`` random
    float32 values, stamps them with consecutive times starting at "now"
    (MST) and spaced ``1 / self.fs`` seconds apart, and writes them with
    ``psycopg2.extras.execute_values`` using ``self.query_insert_raw_data``.
    The elapsed time of the insert call is printed.

    Args:
        seconds: Number of seconds of data to generate.

    Returns:
        ``None`` on success; ``False`` if any step raised, in which case the
        error is logged with its traceback on ``self.timescaleLogger``.
    """
    try:
        current_time = datetime.now(pytz.timezone("MST"))
        num_iterations = seconds * self.fs
        time_increment = timedelta(seconds=1/self.fs)
        raw_data_query = self.query_insert_raw_data
        matrix = np.random.rand(num_iterations, self.num_channel).astype("float32")

        data = []
        # tolist() converts the whole matrix to nested Python floats in one
        # call rather than once per row.
        for raw_data_row in matrix.tolist():
            time_string = current_time.strftime("%Y-%m-%d %H:%M:%S.%f %Z")
            data.append((time_string,) + tuple(raw_data_row))
            current_time = current_time + time_increment

        cur = self.TimescaleDB_Client.cursor()
        try:
            start_time = time.perf_counter()
            psycopg2.extras.execute_values(
                cur, raw_data_query, data, template=None, page_size=100
            )
            print(time.perf_counter() - start_time)
            self.TimescaleDB_Client.commit()
        finally:
            # Release the cursor even when the insert fails.
            cur.close()
    except Exception:
        self.timescaleLogger.exception("An error occurred in insertRAWData")
        return False
The SQL Query String that I am referencing in the above code is obtained from the following:
def getRAWData_Query(self):
    """Build, store and return the INSERT template for ``raw_data``.

    The statement lists ``time`` followed by ``channel1`` ..
    ``channel<N>`` (N = ``self.num_channel``) and ends in ``VALUES %s;``.
    It is stored on ``self.query_insert_raw_data``.

    Returns:
        The SQL string on success; ``False`` if building it raised, in which
        case the error is logged with its traceback on
        ``self.timescaleLogger``.
    """
    try:
        channel_columns = "".join(
            f", channel{channel}" for channel in range(1, self.num_channel + 1)
        )
        self.query_insert_raw_data = (
            "INSERT INTO raw_data (time" + channel_columns + ") VALUES %s;"
        )
        return self.query_insert_raw_data
    except Exception:
        # The message now names this function (it previously said
        # "insertRAWData_Query").
        self.timescaleLogger.exception("An error occurred in getRAWData_Query")
        return False
As you can see, I am using psycopg2.extras.execute_values() to insert the values. To my understanding, this is one of the fastest ways to insert data. However, it takes about 80 seconds for me to insert this data. It is on quite a beefy system with 12 cores/24 threads, SSDs, and 256GB of RAM. Can this be done faster? It just seems quite slow.
I would like to use TimescaleDB and am evaluating its performance. But I am looking to write within 2 seconds or so for it to be acceptable.
Edit I have tried to use pandas to perform the insert, but it took longer, at about 117 seconds. The following is the function that I used.
def insertRAWData_Pandas(self, seconds):
    """Insert ``seconds`` worth of random samples via ``DataFrame.to_sql``.

    Builds a DataFrame with a ``time`` column (consecutive timestamps from
    "now" in MST, spaced ``1 / self.fs`` seconds apart) and one column per
    channel, then appends it to ``raw_data`` through ``self.engine``.
    The elapsed time of DataFrame construction plus ``to_sql`` is printed.

    Args:
        seconds: Number of seconds of data to generate.

    Returns:
        ``None`` on success; ``False`` if any step raised, in which case the
        error is logged with its traceback on ``self.timescaleLogger``.
    """
    try:
        current_time = datetime.now(pytz.timezone("MST"))
        num_iterations = seconds * self.fs
        time_increment = timedelta(seconds=1/self.fs)
        matrix = np.random.rand(num_iterations, self.num_channel).astype("float32")

        timestamps = []
        for _ in range(num_iterations):
            timestamps.append(current_time.strftime("%Y-%m-%d %H:%M:%S.%f %Z"))
            current_time = current_time + time_increment

        pd_df_dict = {"time": timestamps}
        for channel in range(self.num_channel):
            # Column names are 1-based (channel1 .. channelN), matching the
            # names used by the CREATE TABLE and INSERT statements for
            # raw_data; the matrix column index stays 0-based.
            pd_df_dict[f"channel{channel + 1}"] = matrix[:, channel].tolist()

        start_time = time.perf_counter()
        pd_df = pd.DataFrame(pd_df_dict)
        # NOTE(review): to_sql() writes the DataFrame index as an extra column
        # by default -- confirm the target table is expected to have it.
        pd_df.to_sql('raw_data', self.engine, if_exists='append')
        print(time.perf_counter() - start_time)
    except Exception:
        self.timescaleLogger.exception("An error occurred in insertRAWData_Pandas")
        return False
edit I have tried to use CopyManager and it appears to be producing the best results at around 74 seconds. Still not what I was after however.
def insertRAWData_PGCOPY(self, seconds):
    """Insert ``seconds`` worth of random samples using pgcopy's CopyManager.

    Generates ``seconds * self.fs`` rows of ``self.num_channel`` random
    float32 values, each prefixed with a timezone-aware ``datetime``
    (consecutive times from "now" in MST, spaced ``1 / self.fs`` seconds
    apart), and copies them into ``raw_data``. The elapsed time of the copy
    plus commit is printed.

    Args:
        seconds: Number of seconds of data to generate.

    Returns:
        ``None`` on success; ``False`` if any step raised, in which case the
        error is logged with its traceback on ``self.timescaleLogger``.
    """
    try:
        current_time = datetime.now(pytz.timezone("MST"))
        num_iterations = seconds * self.fs
        time_increment = timedelta(seconds=1/self.fs)
        matrix = np.random.rand(num_iterations, self.num_channel).astype("float32")

        data = []
        # Rows carry the datetime object itself (not a formatted string).
        for raw_data_row in matrix.tolist():
            data.append((current_time,) + tuple(raw_data_row))
            current_time = current_time + time_increment

        # Target columns: "time" followed by channel1 .. channelN.
        cols = ("time",) + tuple(
            f"channel{channel}" for channel in range(1, self.num_channel + 1)
        )

        start_time = time.perf_counter()
        mgr = CopyManager(self.TimescaleDB_Client, 'raw_data', cols)
        mgr.copy(data)
        self.TimescaleDB_Client.commit()
        print(time.perf_counter() - start_time)
    except Exception:
        self.timescaleLogger.exception("An error occurred in insertRAWData_PGCOPY")
        return False
I tried to modify the following values in postgresql.conf. There wasn't a noticeable performance improvement.
wal_level = minimal
fsync = off
synchronous_commit = off
wal_writer_delay = 2000ms
commit_delay = 100000
I have tried to modify the chunk size according to one of the below comments using the following in my createRawDataTable() function. However, there wasn't an improvement in the insert times. Perhaps this was also to be expected, given that I haven't been accumulating data. The data in the database has only been a few samples, perhaps at most 1 minute's worth over the course of my testing.
self.query_create_raw_data_hypertable = "SELECT create_hypertable('raw_data', 'time', chunk_time_interval => INTERVAL '3 day',if_not_exists => TRUE);"
Edit: For anyone reading this, I was able to pickle and insert a 32000x464 float32 numpy matrix in about 0.5 seconds with MongoDB, which is what my final solution is. Perhaps MongoDB just does better with this workload in this case.

I have two initial suggestions that may help with overall performance.
The default hypertable you are creating will "chunk" your data by 7 day periods (this means each chunk will hold around 4,838,400,000 rows of data given your parameters). Since your data is so granular, you may want to use a different chunk size. Check out the docs here for info on the optional chunk_time_interval argument. Changing the chunk size should help with inserting and querying speed; it will also give you better performance in compression if needed later on.
As the individuals above stated, playing around with batch inserts should also help. If you haven't checked out this stock data tutorial, I would highly recommend it. Using pgcopy and its function CopyManager could help with inserting df objects more quickly.
Hopefully, some of this information can be helpful to your situation!
disclosure: I am part of the Timescale team 😊

You can use the SQLAlchemy library to do it, and also calibrate the chunksize while you are at it.
Appending the data should possibly take less than 74 seconds, since I perform a similar kind of insertion and it takes me about 40-odd seconds.
Another possibility is to use the pandas.DataFrame.to_sql with method=callable. It will increase the performance drastically.
In comparison to plain to_sql (150s) or to_sql with method='multi' (196s), the callable method did the job in just 14s.
Although a comparative summary for different methods would be best described with the image

One of the fastest ways is to
first create a pandas data frame of your data that you want to insert into the DB
then use the data frame to bulk-insert your data into the DB
here is a way you can do it: How to write data frame to postgres?

Python Jupyter Notebook won't run my code, keeps reconnecting

How come this piece of code does not run properly in Jupyter Notebook?
It keeps reconnecting without any result. I am trying to build a database and scrape data as fast as possible from a web server. I use multiple processes to speed things up and iterate over multiple URLs (each URL represents a different day).
import pandas as pd
import datetime
import urllib
import requests
from pprint import pprint
import time
from io import StringIO
from multiprocessing import Process, Pool
# Ticker symbols to keep from each downloaded file (read by load()).
symbols = ['AAP']
# Wall-clock start, used for the elapsed-time print at the end of the script.
start = time.time()
# NOTE(review): `dflist` is not referenced anywhere else in this listing.
dflist = []
def load(date):
    """Fetch one day's short-volume file and print the rows for tracked symbols.

    Args:
        date: Day to fetch, formatted as it appears in the file name
            (the caller builds ``YYYYMMDD`` strings); ``None`` is a no-op.

    Rows whose ``Symbol`` is in the module-level ``symbols`` list are printed
    without index or header; if there are none, a "No stock found" line is
    printed instead. A day whose download fails with an HTTP error is
    skipped silently.
    """
    if date is None:
        return
    url = "http://regsho.finra.org/FNYXshvol{}.txt".format(date)
    try:
        frame = pd.read_csv(url, delimiter='|')
        # Evaluate the membership test once and reuse it for both the
        # "anything found?" check and the row selection.
        wanted = frame['Symbol'].isin(symbols)
        if not wanted.any():
            print(f'No stock found for {date}' )
        else:
            stocks = frame[wanted]
            print(stocks.to_string(index=False, header=False))
            # Save stocks to mysql
    except urllib.error.HTTPError:
        pass
# Driver: builds the list of dates and maps load() over a 16-process pool,
# then prints the total elapsed time.
# `pool` starts as an empty list here and is rebound to the Pool object below.
pool = []
numdays = 365
start_date = datetime.datetime(2019, 1, 15 ) #year - month - day
# `numdays` consecutive days counting backwards from start_date, as YYYYMMDD strings.
datelist = [
(start_date - datetime.timedelta(days=x)).strftime('%Y%m%d') for x in range(0, numdays)
]
# NOTE(review): the pool is created at module top level with no
# `if __name__ == "__main__":` guard, and load() is defined in the same
# interactive module. On platforms that start workers by spawning a fresh
# interpreter, the children presumably cannot import load() from a notebook
# -- likely related to the hang described above; confirm.
pool = Pool(processes=16)
pool.map(load, datelist)
pool.close()
pool.join()
print(time.time() - start)
Would like to know how I can solve this and make it work

python cassandra driver same insert performance as copy

I'm trying to use Python async with Cassandra to see if I can write records to Cassandra faster than the CQL COPY command.
My python code looks like this:
from cassandra.cluster import Cluster
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement
# Reads dataImport.txt line by line and submits one asynchronous INSERT per
# line at consistency level ONE.
cluster = Cluster(['1.2.1.4'])
session = cluster.connect('test')
with open('dataImport.txt') as f:
for line in f:
# The raw file line is spliced into the statement text as the VALUES list.
query = SimpleStatement (
"INSERT INTO tstTable (id, accts, info) VALUES (%s) " %(line),
consistency_level=ConsistencyLevel.ONE)
# NOTE(review): the object returned by execute_async() is discarded, so
# nothing here waits for the inserts to finish or checks them for errors
# -- presumably the script can end with requests still in flight; confirm.
session.execute_async (query)
but it's giving me the same performance as the COPY command... around 2,700 rows/sec... should it be faster with async?
Do I need to use multithreading in python? Just reading about it but not sure how it fits into this...
EDIT:
So I found something online that I'm trying to modify but can't quite get to work... I have this so far. I also split the file into 3 files in the /Data/toImport/ dir:
import multiprocessing
import time
import os
from cassandra.cluster import Cluster
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement
# Module-level cluster/session: created once when the module is loaded and
# then used by mp_worker() below.
# NOTE(review): mp_worker runs inside multiprocessing.Pool workers, so this
# one session object is presumably shared with (or re-created in) every
# worker process -- confirm whether the driver supports that.
cluster = Cluster(['1.2.1.4'])
session = cluster.connect('test')
# Pool worker: opens the file named by inputArg[0] and submits one
# asynchronous INSERT per line; the returned futures are not kept.
def mp_worker(inputArg):
with open(inputArg[0]) as f:
for line in f:
# The raw file line is spliced into the statement text as the VALUES list.
query = SimpleStatement (
"INSERT INTO CustInfo (cust_id, accts, offers) values (%s)" %(line),
consistency_level=ConsistencyLevel.ONE)
session.execute_async (query)
def mp_handler(inputData, nThreads = 8):
    """Run ``mp_worker`` over every element of ``inputData`` in a process pool.

    Args:
        inputData: Iterable of work items; each one is handed to a single
            ``mp_worker`` call (``chunksize=1``).
        nThreads: Number of worker processes in the pool.

    Blocks until all items have been processed and the workers have exited.
    """
    worker_pool = multiprocessing.Pool(nThreads)
    worker_pool.map(mp_worker, inputData, chunksize=1)
    # No further tasks will be submitted; wait for the workers to exit.
    worker_pool.close()
    worker_pool.join()
# Script entry point (Python 2 syntax: `print` statements). Builds the work
# list, runs mp_handler() and reports the elapsed wall-clock time.
if __name__ == '__main__':
# `file_list` is not assigned anywhere in this script; this is the line that
# raises the NameError shown in the traceback below.
temp_in_data = file_list
start = time.time()
in_dir = '/Data/toImport/'
N_Proc = 8
# NOTE(review): `(in_dir)` is just the string in_dir (parentheses without a
# comma do not make a tuple) and the loop variable `i` is unused, so every
# work item is the directory path itself; mp_worker then indexes it with
# inputArg[0], which for a string is its first character.
file_data = [(in_dir) for i in temp_in_data]
print '----------------------------------Start Working!!!!-----------------------------'
print 'Number of Processes using: %d' %N_Proc
mp_handler(file_data, N_Proc)
end = time.time()
time_elapsed = end - start
print '----------------------------------All Done!!!!-----------------------------'
print "Time elapsed: {} seconds".format(time_elapsed)
but get this error:
Traceback (most recent call last):
File "multiCass.py", line 27, in <module>
temp_in_data = file_list
NameError: name 'file_list' is not defined

This post A Multiprocessing Example for Improved Bulk Data Throughput provides all the details needed to improve the performance of bulk data ingestion. Basically there are 3 mechanisms and additional tuning can be done based on your use-case & hw:
single process (that's the case in your example)
multi-processing single queries
multi-processing concurrent queries
Size of batches and concurrency are the variables you'll have to play with yourself.

got it working like this:
import multiprocessing
import time
import os
from cassandra.cluster import Cluster
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement
def mp_worker(inputArg):
    """Insert every line of one input file into ``testTable``.

    Each call opens its own ``Cluster``/``Session`` (so nothing is shared
    between pool processes), submits one asynchronous INSERT per line, waits
    for all of them to complete, and shuts the cluster connection down.

    Args:
        inputArg: Sequence whose first element is the path of the file to
            load. Each line supplies the text placed inside ``VALUES (...)``.

    Raises:
        Whatever the driver raises for a failed request; the first failure
        is re-raised when its future is waited on.
    """
    cluster = Cluster(['1.2.1.4'])
    try:
        session = cluster.connect('poc')
        pending = []
        with open(inputArg[0]) as f:
            for line in f:
                # NOTE(review): the line is spliced into the CQL text as-is,
                # so the input files must be trusted.
                query = SimpleStatement (
                    "INSERT INTO testTable (cust_id, accts, offers) values (%s)" %(line),
                    consistency_level=ConsistencyLevel.ONE)
                pending.append(session.execute_async (query))
        # Wait for every request: without this the function can return while
        # inserts are still in flight, and request errors are never seen.
        # NOTE(review): this keeps one future per line in memory; drain in
        # batches if the input files are very large.
        for future in pending:
            future.result()
    finally:
        # Close connections and driver threads even if a request failed.
        cluster.shutdown()
def mp_handler(inputData, nThreads = 8):
    """Run ``mp_worker`` over every element of ``inputData`` in a process pool.

    Args:
        inputData: Iterable of work items; each one is handed to a single
            ``mp_worker`` call (``chunksize=1``).
        nThreads: Number of worker processes in the pool.

    Blocks until all items have been processed. If a worker raises, the
    exception propagates to the caller after the pool has been closed and
    joined.
    """
    p = multiprocessing.Pool(nThreads)
    try:
        p.map(mp_worker, inputData, chunksize=1)
    finally:
        # Always release the worker processes, including when map() raises.
        p.close()
        p.join()
# Script entry point (Python 2 syntax: `print` statements). Loads three input
# files in parallel, one pool process per file, and reports the elapsed
# wall-clock time.
if __name__ == '__main__':
temp_in_data = ['/toImport/part-00000', '/toImport/part-00001', '/toImport/part-00002']
start = time.time()
N_Proc = 3
# Wrap each path in a one-element tuple because mp_worker reads inputArg[0].
file_data = [(i,) for i in temp_in_data]
print '----------------------------------Start Working!!!!-----------------------------'
print 'Number of Processes using: %d' %N_Proc
mp_handler(file_data, N_Proc)
end = time.time()
time_elapsed = end - start
print '----------------------------------All Done!!!!-----------------------------'
print "Time elapsed: {} seconds".format(time_elapsed)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

SQLAlchemy performance difference between engine and connection execution - python

Related

Issue with Multiprocessing script in terminal

Inserting into a Cassandra DB is slow even with execute_concurrent()

Fastest way to insert many rows of data?

Python Jupyter Notebook won't run my code, keeps reconnecting

python cassandra driver same insert performance as copy

Categories

Resources