How to improve the runtime of a CSV-writing function in Python

I have code that runs, but it takes a LONG time to complete. Looking at my code, is there any way to improve the speed? I am thinking of looping over the query, putting everything into a dictionary, and then writing it to a CSV file, but I am having issues. Here is my code:
#Import the appropriate models and functions needed for our script
from cbapi.response import *
import logging
import csv
#Connect to our CB Server
conn = CbResponseAPI()
#Sample Query
q = "ipAddress:192.0.0.5"
#Initialize our query
process_query = conn.select(Process).where(q).group_by("id")
#Set your path
my_path='/Users/path/123.csv'
#all object properties for event
objects = ['childproc_count',
           'cmdline',
           'comms_ip',
           'crossproc_count',
           'filemod_count',
           'filtering_known_dlls',
           'group',
           'host_type',
           'hostname',
           'id',
           'interface_ip',
           'last_server_update',
           'last_update',
           'modload_count',
           'netconn_count',
           'os_type',
           'parent_id',
           'parent_md5',
           'parent_name',
           'parent_pid',
           'parent_unique_id',
           'path',
           'process_md5',
           'process_name',
           'process_pid',
           'process_sha256',
           'processblock_count',
           'regmod_count',
           'segment_id',
           'sensor_id',
           'start',
           'terminated',
           'unique_id',
           'username']
with open(my_path, 'w', newline='') as file:
    header = objects  # column names
    writer = csv.DictWriter(file, fieldnames=header)
    writer.writeheader()
    for x in process_query:
        dd = {'id': x.id,
              'childproc_count': x.childproc_count,
              'cmdline': x.cmdline,
              'comms_ip': x.comms_ip,
              'crossproc_count': x.crossproc_count,
              'filemod_count': x.filemod_count,
              'filtering_known_dlls': x.filtering_known_dlls,
              'group': x.group,
              'host_type': x.host_type,
              'hostname': x.hostname,
              'interface_ip': x.interface_ip,
              'last_server_update': x.last_server_update,
              'last_update': x.last_update,
              'modload_count': x.modload_count,
              'netconn_count': x.netconn_count,
              'os_type': x.os_type,
              'parent_id': x.parent_id,
              'parent_md5': x.parent_md5,
              'parent_name': x.parent_name,
              'parent_pid': x.parent_pid,
              'parent_unique_id': x.parent_unique_id,
              'path': x.path,
              'process_md5': x.process_md5,
              'process_name': x.process_name,
              'process_pid': x.process_pid,
              'process_sha256': x.process_sha256,
              'processblock_count': x.processblock_count,
              'regmod_count': x.regmod_count,
              'segment_id': x.segment_id,
              'sensor_id': x.sensor_id,
              'start': x.start,
              'terminated': x.terminated,
              'unique_id': x.unique_id,
              'username': x.username}
        writer.writerow(dd)
The query returns 465,000 records, and the script takes 30+ minutes to run to completion, which is not very efficient.
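One way to tighten the row-building step (a minimal sketch, assuming every name in the objects list is a readable attribute on the returned Process objects) is to build each row with getattr from the same list used for the header, which also protects against typos and missing keys in a hand-built dictionary:

with open(my_path, 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=objects)
    writer.writeheader()
    for x in process_query:
        # Build the row from the same attribute list used for the header,
        # falling back to an empty string if an attribute is missing.
        writer.writerow({attr: getattr(x, attr, '') for attr in objects})

That said, most of the 30+ minutes is likely spent paging 465,000 results back from the Carbon Black server rather than writing the CSV, so narrowing the query (for example by time window) is what will really reduce the wall-clock time.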

Related

Inserting into a Cassandra DB is slow even with execute_concurrent()

I am trying to insert a pandas DataFrame into Cassandra. I am using execute_concurrent, but I don't see any improvement. It is taking almost 5 seconds per row insertion; there are 14k rows, so at this rate it will take more than 15 hours. I have 12 GB RAM and 2 CPU cores. How fast can I run this operation? I've tried different concurrency values without any success. Following is my code:
from flask import session
import yaml
import pandas as pd
import argparse
from get_data import read_params
import cassandra
from cassandra.concurrent import execute_concurrent_with_args, execute_concurrent
from cassandra.cluster import Cluster, ExecutionProfile
from cassandra.auth import PlainTextAuthProvider
import sys
import time

def progressbar(it, prefix="", size=60, out=sys.stdout):  # Python 3.3+
    count = len(it)
    def show(j):
        x = int(size * j / count)
        print("{}[{}{}] {}/{}".format(prefix, u"█" * x, "." * (size - x), j, count),
              end='\r', file=out, flush=True)
    show(0)
    for i, item in enumerate(it):
        yield item
        show(i + 1)
    print("\n", flush=True, file=out)

def cassandraDBLoad(config_path):
    try:
        config = read_params(config_path)
        execution_profile = ExecutionProfile(request_timeout=10)
        cassandra_config = {'secure_connect_bundle': "path"}
        auth_provider = PlainTextAuthProvider(
            "client_id",
            "client_secret"
        )
        cluster = Cluster(cloud=cassandra_config, auth_provider=auth_provider)
        session = cluster.connect()
        session.default_timeout = None
        connect_db = session.execute("select release_version from system.local")
        set_keyspace = session.set_keyspace("Keyspace Name")

        table_ = "big_mart"
        define_columns = "Item_Identifier varchar PRIMARY KEY, Item_Weight varchar, Item_Fat_Content varchar, Item_Visibility varchar, Item_Type varchar, Item_MRP varchar, Outlet_Identifier varchar, Outlet_Establishment_Year varchar, Outlet_Size varchar, Outlet_Location_type varchar, Outlet_Type varchar, Item_Outlet_Sales varchar, source varchar"
        drop_table = f"DROP TABLE IF EXISTS {table_}"
        drop_result = session.execute(drop_table)
        create_table = f"CREATE TABLE {table_}({define_columns});"
        table_result = session.execute(create_table)

        train = pd.read_csv("train_source")
        test = pd.read_csv("test_source")
        # Combine test and train into one file
        train['source'] = 'train'
        test['source'] = 'test'
        df = pd.concat([train, test], ignore_index=True)
        df = df.fillna('NA')

        columns = "Item_Identifier, Item_Weight, Item_Fat_Content, Item_Visibility, Item_Type, Item_MRP, Outlet_Identifier, Outlet_Establishment_Year, Outlet_Size, Outlet_Location_Type, Outlet_Type, Item_Outlet_Sales, source"
        insert_qry = f"INSERT INTO {table_}({columns}) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)"
        statement = session.prepare(insert_qry)
        parameters = [
            (str(df.iat[i, 0]), str(df.iat[i, 1]), str(df.iat[i, 2]), str(df.iat[i, 3]),
             str(df.iat[i, 4]), str(df.iat[i, 5]), str(df.iat[i, 6]), str(df.iat[i, 7]),
             str(df.iat[i, 8]), str(df.iat[i, 9]), str(df.iat[i, 10]), str(df.iat[i, 11]),
             str(df.iat[i, 12]))
            for i in range(len(df))]

        for i in progressbar(range(len(df)), "Computing: ", 40):
            time.sleep(0.1)
            execute_concurrent_with_args(
                session,
                statement,
                parameters,
                concurrency=500
            )
        session.execute(batch)
    except Exception as e:
        raise Exception("(cassandraDBLoad): Something went wrong in the CassandraDB Load operations\n" + str(e))
csv files link - https://drive.google.com/drive/folders/1O03lNTMfSwhUKG61zOs7fNxXIRe44GRp?usp=sharing
Even with concurrent asynchronous requests (execute_concurrent()), it will still be bottlenecked on the client side because there is only so much a single client process can do even when it's multi-threaded.
If you want to maximise the throughput of your cluster, we recommend scaling your app horizontally and running multiple instances (processes). This can easily be achieved with the Python driver using the multiprocessing module. For details, see the Python driver Performance Notes.
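As a rough illustration, here is a minimal multiprocessing sketch (not production code: it reuses the placeholder bundle path, credentials and keyspace name from the question, assumes parameters is the same list of tuples built from the DataFrame above, and picks an arbitrary chunk size and worker count). Each worker opens its own Cluster and Session, since driver sessions should not be shared across processes:

from multiprocessing import Pool

from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster
from cassandra.concurrent import execute_concurrent_with_args

# Placeholder connection details copied from the question.
CASSANDRA_CONFIG = {'secure_connect_bundle': "path"}
INSERT_CQL = ("INSERT INTO big_mart(Item_Identifier, Item_Weight, Item_Fat_Content, "
              "Item_Visibility, Item_Type, Item_MRP, Outlet_Identifier, "
              "Outlet_Establishment_Year, Outlet_Size, Outlet_Location_Type, "
              "Outlet_Type, Item_Outlet_Sales, source) "
              "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)")

def load_chunk(rows):
    # Each worker process opens its own connection and prepares its own statement.
    auth_provider = PlainTextAuthProvider("client_id", "client_secret")
    cluster = Cluster(cloud=CASSANDRA_CONFIG, auth_provider=auth_provider)
    session = cluster.connect("Keyspace Name")
    statement = session.prepare(INSERT_CQL)
    execute_concurrent_with_args(session, statement, rows, concurrency=100)
    cluster.shutdown()

def chunks(seq, size):
    for start in range(0, len(seq), size):
        yield seq[start:start + size]

if __name__ == "__main__":
    # `parameters` is the list of tuples built from the DataFrame in the question.
    with Pool(processes=4) as pool:          # roughly one worker per core
        pool.map(load_chunk, list(chunks(parameters, 5000)))

Note that execute_concurrent_with_args is called once per chunk with the whole list of argument tuples, not once per row, which also removes the per-row time.sleep(0.1) loop from the original code.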
Finally, if your goal is to simply bulk-load data to your Cassandra DB, it makes no sense to re-invent the wheel by writing your own application when there are free, open-source tools that exist specifically for this use case.
You can use the DataStax Bulk Loader tool (DSBulk) to bulk load data in CSV format to a Cassandra table. Here are some references with examples to help you get started quickly:
Blog - DSBulk Intro + Loading data
Blog - More DSBulk Loading examples
Blog - Counting records with DSBulk
Docs - Loading data examples
DSBulk is open-source so it's free to use. Cheers!

How can I refresh the data in the background of a running flask app?

I have a simple Flask app that queries a database, writes a CSV, and then uses pyplot to create a chart from it.
I would like to refresh the data in the background every 10 minutes while the app is running. The page doesn't need to refresh the html automatically. It just needs to have fresh data when someone opens the page.
Can I do that in a single script? Or do I need to run a different script outside in crontab or something?
I would just kick over the container every 10 minutes but it takes about 5 minutes to get the query, so that's a 5 minute outage. Not a great idea. I'd prefer it to fetch in the background.
Here is what I'm working with:
import os
from datetime import date
import teradatasql
import pandas as pd
import matplotlib.pyplot as plt
from flask import Flask, render_template
import time
import multitasking

### variables
ausername = os.environ.get('dbuser')
apassword = os.environ.get('dbpassword')
ahost = os.environ.get('dbserver')
systems = ["prd1", "prd2", "frz1", "frz2", "devl"]
qgsystems = ["", "#Tera_Prd2_v2", "#Tera_Frz1_v2", "#Tera_Frz2_v2", "#Tera_Devl_v2"]
weeks = ["0", "7", "30"]
query = """{{fn teradata_write_csv({system}_{week}_output.csv)}}select (bdi.infodata) as sysname,
to_char (thedate, 'MM/DD' ) || ' ' || Cast (thetime as varchar(11)) as Logtime,
sum(drc.cpuuexec)/sum(drc.secs) (decimal(7,2)) as "User CPU",
sum(drc.cpuuserv)/sum(drc.secs) (decimal(7,2)) as "System CPU",
sum(drc.cpuiowait)/sum(drc.secs) (decimal(7,2)) as "CPU IO Wait"
from dbc.resusagescpu{qgsystem} as drc
left outer join boeing_tables.dbcinfotbl{qgsystem} as bdi
on bdi.infokey = 'sysname'
where drc.thedate >= (current_date - {week})
order by logtime asc
Group by sysname,logtime
;
"""

### functions
@multitasking.task
def fetch(system, qgsystem, week):
    with teradatasql.connect(host=ahost, user=ausername, password=apassword) as con:
        with con.cursor() as cur:
            cur.execute(query.format(system=system, qgsystem=qgsystem, week=week))
            [print(row) for row in cur.fetchall()]

@multitasking.task
def plot(system, week):
    for week in weeks:
        for system in systems:
            df = pd.read_csv(system + "_" + week + "_output.csv")
            df.pop('sysname')
            df.plot.area(x="Logtime")
            figure = plt.gcf()
            figure.set_size_inches(12, 6)
            plt.savefig("/app/static/" + system + "_" + week + "_webchart.png", bbox_inches='tight', dpi=100)

### main
for week in weeks:
    for system, qgsystem in zip(systems, qgsystems):
        fetch(system, qgsystem, week)
for week in weeks:
    for system in systems:
        plot(system, week)

app = Flask(__name__, template_folder='templates')

@app.route('/')
def index():
    return render_template("index.html")
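One possible approach, shown as a minimal sketch rather than a tested answer (it assumes the fetch and plot functions above can safely run outside a request, and keeps the 10-minute interval from the question), is to run the refresh loop in a daemon thread started alongside the Flask app, so each page load simply serves whatever charts were generated most recently:

import threading

REFRESH_SECONDS = 600  # 10 minutes

def refresh_loop():
    # Re-run the same fetch/plot passes from the question, then sleep;
    # incoming requests keep serving the most recently written PNGs.
    while True:
        for week in weeks:
            for system, qgsystem in zip(systems, qgsystems):
                fetch(system, qgsystem, week)
        for week in weeks:
            for system in systems:
                plot(system, week)
        time.sleep(REFRESH_SECONDS)

# Start the background refresher once, before serving requests.
threading.Thread(target=refresh_loop, daemon=True).start()

Matplotlib generally needs a non-interactive backend (for example matplotlib.use('Agg')) when plotting outside the main thread, and under a multi-worker WSGI server each worker would run its own refresher, so a scheduler such as APScheduler or an external cron job may be a better fit in production.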

Encoding issue when exporting from a DataFrame in Python to MS Access

Dears,
I have a Python script with a query that reads from a DB, stores the result in a DataFrame, and then exports it to MS Access.
In the loop, it divides the result into 3 files (each file has a different month).
The issue is in the column LI_DESC: it contains Arabic letters that display correctly in Jupyter, but the characters come out wrong when exported to Access.
Here are the columns displaying correctly in Jupyter:
Here are the columns as shown in the Access file:
Python code:
import cx_Oracle
import os
import accessdb
import pandas as pd

dsn_tns = cx_Oracle.makedsn('10.112.**.****', '1521', service_name='cdwn10g.hq')
conn = cx_Oracle.connect(user='BI', password='BI', dsn=dsn_tns, encoding='utf-8')
sql_query = pd.read_sql_query("""SELECT MONTH1,LI_DESC,PORT,REGS_NUM,REG_DT,CTRY_CD,TAR_CD,UNS_QTY,UN_CD,KGN,KGG,CIF_AMT,CURCY_CD,CURCY_RT
FROM STTS.CDS
WHERE SUBSTR(REG_DT_G,1,6) BETWEEN to_number(extract(year from add_months(sysdate,-3)) || '' || to_char(add_months(sysdate,-3), 'MM')) AND to_number(extract(year from add_months(sysdate,-1)) || '' || to_char(add_months(sysdate,-1), 'MM'))
ORDER BY PORT, REGS_NUM, REG_DT""", conn)
df = pd.DataFrame(sql_query)

from datetime import datetime
today = datetime.now()

if not os.path.exists(r'C:\Users\nalkar\Documents\Python Scripts\RUNDATE' + today.strftime('%Y%m%d')):
    os.makedirs(r'C:\Users\nalkar\Documents\Python Scripts\RUNDATE' + today.strftime('%Y%m%d'))
    months = df['MONTH1'].unique().tolist()
    for month in months:
        mydf = df.loc[df.MONTH1 == month]
        mydf.to_accessdb(r"C:\Users\nalkar\Documents\Python Scripts\RUNDATE" + today.strftime('%Y%m%d') + "\%s.accdb" % month, "Data")
    print('done')
else:
    print(r'directory already exist')
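As a diagnostic sketch (not a fix; the column name is taken from the code above and the output file name is arbitrary), one way to tell whether the text is mangled inside the DataFrame or only during the Access export is to dump the same column to a UTF-8 file and inspect the raw values:

# Sketch: check whether LI_DESC is still intact before it reaches Access.
sample = df['LI_DESC'].head(20)
print(sample.map(repr))                                  # escaped view of the stored strings
sample.to_csv('li_desc_check.csv', encoding='utf-8-sig', index=False)

If the CSV shows the Arabic text correctly in a UTF-8 aware editor, the DataFrame itself is fine and the problem lies in how the Access export or the viewer handles the encoding rather than in the Oracle query.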

Writing ADODB Recordset to Pivot Cache With Python

I am working on a project where I am converting some VBA code to Python, in order to have Python interact with Excel in much the same way VBA would. In this particular case, I am utilizing the win32com library to have Python extract data from an Oracle Database via an ADODB Connection and write the resulting recordset directly to a pivot cache. I.e. creating a pivot table with data from an external source.
import win32com.client
Excel = win32com.client.gencache.EnsureDispatch('Excel.Application')
win32c = win32com.client.constants
# Create and Open Connection
conn = win32com.client.Dispatch(r'ADODB.Connection')
DSN = 'Provider=OraOLEDB.Oracle; Data Source=localhost:1521/XEPDB1; User Id=system; Password=password;'
conn.Open(DSN)
# Create Excel File
wb = Excel.Workbooks.Add()
Sheet1 = wb.Worksheets("Sheet1")
# Create Recordset
RS = win32com.client.Dispatch(r'ADODB.Recordset')
RS.Open('SELECT * FROM employees', conn, 1, 3)
# Create Pivot Cache
PivotCache = wb.PivotCaches().Create(SourceType=win32c.xlExternal, Version=win32c.xlPivotTableVersion15)
# Write Recordset to Pivot Cache
PivotCache.Recordset = RS # <~~ This is where it breaks!
# Create Pivot Table
Pivot = PivotCache.CreatePivotTable(TableDestination=Sheet1.Cells(2, 2), TableName='Python Test Pivot', DefaultVersion=win32c.xlPivotTableVersion15)
# Close Connection
RS.Close()
conn.Close()
# View Excel
Excel.Visible = 1
I am successful in extracting the data via ADODB and creating an Excel file, but when I try to write the resulting recordset to the pivot cache by setting PivotCache.Recordset = RS, I get the following error.
[Running] venv\Scripts\python.exe "c:\Project\Test\debug_file_test.py"
Traceback (most recent call last):
File "c:\Project\Test\debug_file_test.py", line 29, in <module>
PivotCache.Recordset = RS # <~~ This is where it breaks!
File "c:\Project\venv\lib\site-packages\win32com\client\__init__.py", line 482, in __setattr__
self._oleobj_.Invoke(*(args + (value,) + defArgs))
pywintypes.com_error: (-2147352567, 'Exception occurred.', (0, None, 'No such interface supported\r\n', None, 0, -2146827284), None)
[Done] exited with code=1 in 0.674 seconds
Can anybody shed some light on what I am doing wrong?
I ended up finding a solution to the issue, and want to post an answer for anyone who may come across this question at some point.
Instead of creating the recordset with Recordset.Open(), I tried using the command object and creating the recordset with cmd.Execute(). As it turns out, Execute returns a tuple, so I had to pass cmd.Execute()[0] to the recordset in order to make it work.
This doesn't answer why my initial code doesn't work, but it does provide an answer for how to write an ADODB recordset to a PivotCache with Python.
import win32com.client
#Initiate Excel Application
Excel = win32com.client.gencache.EnsureDispatch('Excel.Application')
win32c = win32com.client.constants
# Create and Open Connection
conn = win32com.client.Dispatch('ADODB.Connection')
cmd = win32com.client.Dispatch('ADODB.Command')
DSN = 'Provider=OraOLEDB.Oracle; Data Source=localhost:1521/XEPDB1; User Id=system; Password=password;'
conn.Open(DSN)
# Define Command Properties
cmd.ActiveConnection = conn
cmd.ActiveConnection.CursorLocation = win32c.adUseClient
cmd.CommandType = win32c.adCmdText
cmd.CommandText = 'SELECT * FROM employees'
# Create Excel File
wb = Excel.Workbooks.Add()
Sheet1 = wb.Worksheets("Sheet1")
# Create Recordset
RS = win32com.client.Dispatch('ADODB.Recordset')
RS = cmd.Execute()[0]
# Create Pivot Cache
PivotCache = wb.PivotCaches().Create(SourceType=win32c.xlExternal, Version=win32c.xlPivotTableVersion15)
PivotCache.Recordset = RS
# Create Pivot Table
Pivot = PivotCache.CreatePivotTable(TableDestination=Sheet1.Cells(2, 2), TableName='Python Test Pivot', DefaultVersion=win32c.xlPivotTableVersion15)
# Close Connection
RS.Close()
conn.Close()
# View Excel
Excel.Visible = 1
Update
As hinted by @Parfait, the code above also works if RS = cmd.Execute()[0] is replaced by
RS.Open(cmd)
which I actually prefer because it keeps the Python code aligned with the original VBA syntax.

Aerospike - Python Client - NoSQL benchmark

I was performing a NoSQL performance benchmark for a client, and I was wondering whether my Aerospike Python code is optimal. I'm trying to record query time and load time. The data has 500,000 rows and 8 columns. My code is below.
def test_db():
    config = {
        'hosts': [('127.0.0.1', 3000)]
    }
    client = aerospike.client(config).connect()

    t0 = time.time()
    global rec
    rec = {}
    with open('skunkworks.csv', 'r') as f:
        reader = csv.reader(f)
        rownum = 0
        for row in reader:
            # Save first row with headers
            if rownum == 0:
                header = row
            else:
                colnum = 0
                for col in row:
                    rec[header[colnum]] = col
                    colnum += 1
            rownum += 1
            if rec:
                client.put(('test', 'demo', str(rownum)), rec)
                rec = {}
    t1 = time.time()
    load_time = t1 - t0

    t2 = time.time()
    for i in range(2, 500002):
        (key, metadata, record) = client.get(('test', 'demo', str(i)))
        # print(record)
    t3 = time.time()
    read_time = t3 - t2

    return [load_time, read_time]
Is your Python application going to run as a single process, or will it be a multi-process approach, such as fastCGI?
If you're trying to benchmark, make sure it simulates how your application will run. Writing lots of rows and then reading lots of rows, all from a single process, doesn't usually simulate anything realistic. Perhaps in your case it does, but if not, make your sample code match the real access pattern.
Also, you should deploy your benchmark in a similar way to the application. Don't run benchmarks on the same machine as the server nodes, if that's not how it'll be in production.
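For example, if the real application will run as multiple processes, a rough sketch of a multi-process read benchmark (an illustration only: the worker count and key ranges are arbitrary, and it assumes the 500,000 records written by test_db() above) gives each worker its own client connection:

import time
from multiprocessing import Pool

import aerospike

def read_range(key_range):
    # Each worker process opens its own client connection.
    client = aerospike.client({'hosts': [('127.0.0.1', 3000)]}).connect()
    start, stop = key_range
    t0 = time.time()
    for i in range(start, stop):
        client.get(('test', 'demo', str(i)))
    client.close()
    return time.time() - t0

if __name__ == "__main__":
    # Split the key space written by test_db() across 4 workers.
    ranges = [(2 + n * 125000, 2 + (n + 1) * 125000) for n in range(4)]
    with Pool(processes=4) as pool:
        print(pool.map(read_range, ranges))

Run it from a separate machine (or machines) from the server nodes, as suggested above, so the client does not compete with the database for CPU.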
