No data is read when using lmdb cursor in Python - python

I have a lmdb database and I'm trying to read its contents. The irony is nothing gets printed on screen. This is the code snippet that I have written for reading from lmdb:
import caffe
import lmdb
import numpy as np
from caffe.proto import caffe_pb2
import cv2
import sys
# Open the LMDB environment, start a transaction and create the cursor
# that is iterated below.
db_train = lmdb.open('mnist_train_lmdb')
db_train_txn = db_train.begin()
cursor = db_train_txn.cursor()

# Debug output: show the three handles. With a single argument the
# parenthesised form prints the same text on Python 2 and Python 3.
print(db_train)
print(db_train_txn)
print(db_train_txn.cursor())

datum = caffe_pb2.Datum()
index = sys.argv[0]

# Pre-allocate the destination arrays.
size_train = 50000
size_test = 10000
data_train = np.zeros((size_train, 1, 28, 28))
label_train = np.zeros(size_train, dtype=int)

print('Reading training data...')
i = -1
for _key, raw_value in cursor:
    i += 1
    # Progress indicator every 1000 records.
    if i % 1000 == 0:
        print(i)
    # Stop once the pre-allocated arrays are full.
    if i == size_train:
        break
    # Decode the serialized Datum, then store its label and pixel array.
    datum.ParseFromString(raw_value)
    record_label = datum.label
    data_train[i] = caffe.io.datum_to_array(datum)
    label_train[i] = record_label
This prints :
<Environment object at 0x0000000009CE3990>
<Transaction object at 0x0000000009CE1810>
<Cursor object at 0x0000000009863738>
Reading training data...
Reading test data...
It seems the for loop doesn't run at all. What am I missing here?
I checked and it seems this is the normal way of reading from lmdb, all source examples that I have seen have similar approach.

Correct myself:
Both ways of iterating over an lmdb.Cursor(),
for key, value in cursor:
and
while cursor.next():
are right and I was wrong in the original answer.
You didn't use cursor properly and a slight modification should be made in your code like:
...  # original stuff
print('Reading training data...')
i = -1
# The cursor starts in an unpositioned state; each next() call moves it
# to the next element.
while cursor.next():
    i += 1
    # Progress indicator every 1000 records.
    if i % 1000 == 0:
        print(i)
    # Stop once the pre-allocated arrays are full.
    if i == size_train:
        break
    # Decode the record the cursor currently points at.
    datum.ParseFromString(cursor.value())
    record_label = datum.label
    data_train[i] = caffe.io.datum_to_array(datum)
    label_train[i] = record_label
For more usage about lmdb python binding, you can refer here.

OK, it seems the database was faulty! I used another database, and both my code snippet and the one suggested by @DaleSong worked just fine.

Related

How can i optimize my Python loop for speed

I wrote some code that uses OCR to extract text from screenshots of follower lists and then transfer them into a data frame.
The reason I have to do the hustle with "name" / "display name" and removing blank lines is that the initial text extraction looks something like this:
Screenname 1
name 1
Screenname 2
name 2
(and so on)
So I know in which order each extraction will be.
My code works well for 1-30 images, but if I take more than that it gets a bit slow. My goal is to run around 5-10k screenshots through it at once. I'm pretty new to programming, so any ideas/tips on how to optimize the speed would be very much appreciated! Thank you all in advance :)
from PIL import Image
from pytesseract import pytesseract
import os
import pandas as pd
from itertools import chain
# Accumulators. Each starts with an empty-string placeholder; when the
# lists are flattened with chain.from_iterable() below, that placeholder
# contributes no elements.
list_final = [""]
list_name = [""]
liste_anzeigename = [""]
list_raw = [""]
anzeigename = [""]
name = [""]
sort = [""]

f = r'/Users/PycharmProjects/pythonProject/images'
myconfig = r"--psm 4 --oem 3"

# Pass 1: crop every screenshot to the region that is OCR'd later and
# write it back over the original file.
# NOTE(review): the crop is saved in place, so running the script twice
# crops already-cropped images again -- confirm that is intended.
for file in os.listdir(f):
    f_img = os.path.join(f, file)
    img = Image.open(f_img)
    img = img.crop((240, 400, 800, 2400))
    img.save(f_img)

# Pass 2: OCR each cropped image and split the non-blank lines into
# screen names (even positions) and display names (odd positions).
for file in os.listdir(f):
    f_img = os.path.join(f, file)
    # Only ``Image`` is imported from PIL, so it must be used directly;
    # the name ``PIL`` itself is never bound in this script.
    test = pytesseract.image_to_string(Image.open(f_img), config=myconfig)
    lines = test.split("\n")
    list_raw = [line for line in lines if line.strip() != ""]
    sort.append(list_raw)
    # Nine name/display-name pairs are read per image; an image that
    # yields fewer than 18 non-blank lines raises IndexError here.
    name = {list_raw[k] for k in range(0, 18, 2)}
    list_name.append(name)
    anzeigename = {list_raw[k] for k in range(1, 18, 2)}
    liste_anzeigename.append(anzeigename)

# Flatten the per-image collections.
reihenfolge_name = list(chain.from_iterable(list_name))
index_anzeigename = list(chain.from_iterable(liste_anzeigename))
sortieren = list(chain.from_iterable(sort))
print(list_raw)

# Sets do not keep order, so restore the order in which each string
# first appeared in the raw OCR output.
sort_name = sorted(reihenfolge_name, key=sortieren.index)
sort_anzeigename = sorted(index_anzeigename, key=sortieren.index)
final = pd.DataFrame(zip(sort_name, sort_anzeigename), columns=['name', 'anzeigename'])
print(final)
Use a multiprocessing.Pool.
Combine the code under the for-loops, and put it into a function process_file.
This function should accept a single argument; the name of a file to process.
Next using listdir, create a list of files to process.
Then create a Pool and use its map method to process the list;
import multiprocessing as mp
import os


def process_file(name):
    """Process a single file and return its result.

    ``name`` is one entry of the directory listing passed to ``Pool.map``.
    The return value is sent from the worker process back to the parent.
    """
    # your code goes here.
    return anzeigename  # Or whatever the result should be.


# Compare with ``==``: ``is`` tests object identity, which is not
# guaranteed to hold for two equal strings.
if __name__ == "__main__":
    f = r'/Users/PycharmProjects/pythonProject/images'
    # The context manager releases the worker processes once map() has
    # returned all results.
    with mp.Pool() as p:
        liste_anzeigename = p.map(process_file, os.listdir(f))
This will run your code in parallel in as many cores as your CPU has.
For an N-core CPU this will take approximately 1/N of the time it takes without multiprocessing.
Note that the return value of the worker function should be pickleable; it has to be returned from the worker process to the parent process.

How to use index in python list?

I am trying to execute the code of pyethereum but when I was analyzing the code in
pyethereum/ethereum/hybrid_casper/consensus.py
I can't understand where the 'NULL_SENDER' value is defined and how state.config['NULL_SENDER'] is evaluated.
key, account = state.config['NULL_SENDER'], privtoaddr(state.config['NULL_SENDER'])
Let's look through all the code.
from ethereum import utils, transactions
from ethereum.common import update_block_env_variables
from ethereum.messages import apply_transaction
from ethereum.hybrid_casper import casper_utils
from ethereum.utils import privtoaddr
# Block initialization state transition
def initialize(state, block=None):
    """Reset the per-block fields of ``state`` and apply block-start transitions.

    Steps, in order:
      1. Zero ``txindex``, ``gas_used`` and ``bloom`` and empty ``receipts``.
      2. If ``block`` is given, pass it to ``update_block_env_variables``.
      3. When ``state.block_number`` is a non-zero multiple of
         ``EPOCH_LENGTH``, build, sign and apply an ``initialize_epoch``
         transaction addressed to ``CASPER_ADDRESS``.
      4. When ``state.is_DAO(at_fork_height=True)`` holds, transfer the
         balance of every ``CHILD_DAO_LIST`` account to ``DAO_WITHDRAWER``.
      5. When ``state.is_METROPOLIS(at_fork_height=True)`` holds, install
         ``METROPOLIS_GETTER_CODE`` at the two Metropolis store addresses.
    """
    config = state.config

    # Reset the per-block bookkeeping.
    state.txindex = 0
    state.gas_used = 0
    state.bloom = 0
    state.receipts = []

    if block is not None:
        update_block_env_variables(state, block)

    # Initialize the next epoch in the Casper contract.
    epoch_length = state.config['EPOCH_LENGTH']
    on_epoch_boundary = state.block_number % epoch_length == 0
    if on_epoch_boundary and state.block_number != 0:
        # NULL_SENDER is used both as the signing key and, through
        # privtoaddr, as the source of the sending address.
        sender_key = state.config['NULL_SENDER']
        sender_addr = privtoaddr(sender_key)
        epoch_number = state.block_number // epoch_length
        calldata = casper_utils.casper_translator.encode('initialize_epoch', [epoch_number])
        unsigned_tx = transactions.Transaction(
            state.get_nonce(sender_addr), 0, 3141592,
            state.config['CASPER_ADDRESS'], 0, calldata)
        epoch_tx = unsigned_tx.sign(sender_key)
        ok, _output = apply_transaction(state, epoch_tx)
        assert ok

    if state.is_DAO(at_fork_height=True):
        for child in state.config['CHILD_DAO_LIST']:
            withdrawer = state.config['DAO_WITHDRAWER']
            state.transfer_value(child, withdrawer, state.get_balance(child))

    if state.is_METROPOLIS(at_fork_height=True):
        for store_key in ("METROPOLIS_STATEROOT_STORE", "METROPOLIS_BLOCKHASH_STORE"):
            state.set_code(
                utils.normalize_address(config[store_key]),
                config["METROPOLIS_GETTER_CODE"])
We can see that this code is most likely being imported by multiple programs, which it is! https://github.com/ethereum/pyethereum/blob/develop/ethereum/hybrid_casper/chain.py
If we see the usage of the function in chain.py, the state parameter is being fulfilled with self.state, it is self.state = self.mk_poststate_of_blockhash(self.db.get(b'head_hash')).
This function returns a State object, made in ethereum.state, which can be turned into a dictionary. This most likely means that it is getting the value that corresponds to the key called 'NULL_SENDER'.

Python: correct format on input to datagridview

I want to add rows to a datagridview "manually". I tried converting the following code to python: https://learn.microsoft.com/en-us/dotnet/framework/winforms/controls/how-to-manipulate-rows-in-the-windows-forms-datagridview-control
However, I struggle with adding rows. The following doesn't work:
# Passes each row to Rows.Add as one single argument.
for row in signals:
    self._dataGridView1.Rows.Add(row)
The following code does work, but is not dynamically enough as I don't know how many elements there will be:
# Passes the first four values of each row as separate arguments.
for row in signals:
    self._dataGridView1.Rows.Add(row[0], row[1], row[2], row[3])
How should I fix this? I tried tuple, but the result were a tuple with all the info shown in the first cell instead of spread over the columns.
I would prefer not to add packages, as this is to be run within Revit Dynamo by several users, and I cannot convince everyone to install packages.
full code for context:
import clr
clr.AddReference('System.Windows.Forms')
clr.AddReference('System.Drawing')
clr.AddReference('System.Data')
clr.AddReference('RevitAPIUI')
from Autodesk.Revit.UI import TaskDialog
from System.Windows.Forms import *
from System.Drawing import (
Point, Size,
Font, FontStyle,
GraphicsUnit
)
from System.Data import DataSet
from System.Data.Odbc import OdbcConnection, OdbcDataAdapter
msgBox = TaskDialog
# Inputs supplied by the host node: column names and the rows to display.
headers = IN[0]
signals = IN[1]


class DataGridViewQueryForm(Form):
    """Form that shows ``signals`` in a DataGridView, one column per entry of ``headers``."""

    def __init__(self):
        self.Text = 'Signals'
        self.ClientSize = Size(942, 255)
        self.MinimumSize = Size(500, 200)
        self.setupDataGridView()

    def setupDataGridView(self):
        """Create the grid, name its columns from ``headers`` and add one row per signal."""
        # Imported here so this method does not depend on a separate
        # change to the import section of the script.
        from System import Array

        grid = DataGridView()
        self._dataGridView1 = grid
        grid.AllowUserToOrderColumns = True
        grid.ColumnHeadersHeightSizeMode = DataGridViewColumnHeadersHeightSizeMode.AutoSize
        grid.Dock = DockStyle.Fill
        grid.Location = Point(0, 111)
        grid.Size = Size(506, 273)
        grid.TabIndex = 3
        grid.ColumnCount = len(headers)
        grid.ColumnHeadersVisible = True
        for i, header in enumerate(headers):
            grid.Columns[i].Name = header
        for row in signals:
            # Hand the row over as a .NET object array so its values are
            # spread across the columns, however many there are, instead
            # of spelling out a fixed number of cells.
            grid.Rows.Add(Array[object](tuple(row)))
        self.Controls.Add(grid)


Application.Run(DataGridViewQueryForm())
Figured it out. Had to use System.Array.
from System import Array
code changes:
# One reusable .NET string array, sized to the number of columns.
array_str = Array.CreateInstance(str, len(headers))
for row in signals:
    # Copy the current row into the array, then add it as one grid row.
    for k in range(len(headers)):
        array_str[k] = row[k]
    self._dataGridView1.Rows.Add(array_str)

Optimize output of a script by varying input parameters

I have a written a script that uses the code below and I would like to optimize rsi_high and rsi_low to get the best sharpe_ratio:
#
import numpy
import talib as ta
# RSI thresholds read by rsicalc: above rsi_high gives position +1,
# below rsi_low gives position -1.
rsi_high, rsi_low = 63, 41
def myTradingSystem(DATE, OPEN, HIGH, LOW, CLOSE, VOL, exposure, equity, settings):
    """Derive one position per column of CLOSE from its RSI.

    ``rsicalc`` is applied along axis 0 of ``CLOSE``; the positions it
    reports are returned as a float64 array together with the unchanged
    ``settings``. The other arguments are accepted but not used here.
    """
    n_markets = CLOSE.shape[1]  # column count of CLOSE; not used below
    rsi_values, rsi_positions = numpy.apply_along_axis(rsicalc, 0, CLOSE)
    positions = numpy.asarray(rsi_positions, dtype=numpy.float64)
    return positions, settings
def rsicalc(num):
    """Return ``(rsival, pos_rsi)`` for one price series.

    ``rsival`` is the 14-period RSI array computed by ``ta.RSI``.
    ``pos_rsi`` is 1 when ``rsival[14]`` is above ``rsi_high``, -1 when it
    is below ``rsi_low`` and 0 otherwise. If the calculation fails,
    ``(0, 0)`` is returned instead.
    """
    try:
        rsival = ta.RSI(numpy.array(num, dtype='f8'), timeperiod=14)
        # NOTE(review): the decision reads index 14 of the RSI array, not
        # its last element -- confirm that this is the intended sample.
        if rsival[14] > rsi_high:
            pos_rsi = 1
        elif rsival[14] < rsi_low:
            pos_rsi = -1
        else:
            pos_rsi = 0
    except Exception:
        # Best-effort fallback for any calculation error. Unlike a bare
        # ``except:`` this lets KeyboardInterrupt/SystemExit propagate.
        rsival = 0
        pos_rsi = 0
    return rsival, pos_rsi
def mySettings():
    """Return the settings dictionary for this trading system."""
    # Futures contracts (plus CASH) to evaluate.
    markets = [
        'CASH', 'F_AD', 'F_BO', 'F_BP', 'F_C', 'F_CC', 'F_CD',
        'F_CL', 'F_CT', 'F_DX', 'F_EC', 'F_ED', 'F_ES', 'F_FC', 'F_FV', 'F_GC',
        'F_HG', 'F_HO', 'F_JY', 'F_KC', 'F_LB', 'F_LC', 'F_LN', 'F_MD', 'F_MP',
        'F_NG', 'F_NQ', 'F_NR', 'F_O', 'F_OJ', 'F_PA', 'F_PL', 'F_RB', 'F_RU',
        'F_S', 'F_SB', 'F_SF', 'F_SI', 'F_SM', 'F_TU', 'F_TY', 'F_US', 'F_W',
        'F_XX', 'F_YM',
    ]
    return {
        'markets': markets,
        'slippage': 0.05,
        'budget': 1000000,
        'beginInSample': '19900101',
        'endInSample': '19931231',
        'lookback': 504,
    }
# Evaluate trading system defined in current file.
if __name__ == '__main__':
    # The toolbox is only needed when this file is run directly.
    import quantiacsToolbox

    # Run the system defined in this file without plotting, then read
    # the Sharpe ratio out of the returned statistics.
    results = quantiacsToolbox.runts(__file__, plotEquity=False)
    stats = results['stats']
    sharpe_ratio = stats['sharpe']
I suspect that using something like scipy minimize function would do the trick, but I am having trouble understanding how to package my script so that it can be in a usable form.
I have tried putting everything in a function and then running all the code through a number of loops, each time incrementing values but there must be a more elegant way of doing this.
Apologies for posting all my code but I thought it would help if the responder wanted to reproduce my setup and for anyone who is new to quantiacs to see a real example who is faced with the same issue.
Thanks for your help in advance!

How do I gather performance metrics for GDI and user Objects using python

Think this is my first question I have asked on here normally find all the answers I need (so thanks in advance)
ok my problem I have written a python program that will in threads monitor a process and output the results to a csv file for later. This code is working great I am using win32pdhutil for the counters and WMI, Win32_PerfRawData_PerfProc_Process for the CPU %time. I have now been asked to monitor a WPF application and specifically monitor User objects and GDI objects.
This is where I have a problem: I can't seem to find any Python support for gathering metrics on these two counters. Both are easily available in Task Manager, so I find it odd that there is so little information about them. I specifically want to gather them to see whether we have a memory leak, and I don't want to install anything on the system other than Python, which is already installed. Can you please help me find a solution?
I am using python 3.3.1, this will be running on a windows platform (mainly win7 and win8)
This is the code i am using to gather the data
def gatherIt(self, whoIt, whatIt, type, wiggle, process_info2):
    """Gather one metric for the process named ``whoIt``.

    ``type`` selects what is measured:
      * ``"counter"`` -- the result of
        ``win32pdhutil.FindPerformanceAttributesByName`` for the counter
        named ``whatIt``.
      * ``"cpu"`` -- a CPU percentage computed from two consecutive raw
        samples read through ``wiggle.Win32_PerfRawData_PerfProc_Process``.

    Returns the gathered value, or the string ``"N/A"`` when ``type`` is
    unknown, the lookup fails, no sample is available, or the result is
    empty. ``process_info2`` is accepted but not used here.
    """
    NO_DATA = "N/A"
    # String results that are treated as "nothing was measured".
    EMPTY_STRINGS = ("[]", "", " ")

    data = 0.0
    failed = False

    if type == "counter":
        # Gather data according to the attributes.
        try:
            data = win32pdhutil.FindPerformanceAttributesByName(whoIt, counter=whatIt)
        except Exception:
            # The lookup failed, e.g. because the process is not there.
            failed = True
    elif type == "cpu":
        try:
            previous = (0, 0)  # raw (counter, timestamp) of the prior sample
            percent_processor_time = None
            # Two passes: the second sample is measured against the first
            # so the result reflects the interval between the two reads.
            for _ in range(2):
                for procP in wiggle.Win32_PerfRawData_PerfProc_Process(name=whoIt):
                    n1 = int(procP.PercentProcessorTime)
                    d1 = int(procP.Timestamp_Sys100NS)
                    n0, d0 = previous
                    try:
                        percent_processor_time = (float(n1 - n0) / float(d1 - d0)) * 100.0
                    except ZeroDivisionError:
                        percent_processor_time = 0.0
                    previous = (n1, d1)
            if percent_processor_time is None:
                # No matching process was reported, so nothing was measured.
                failed = True
            else:
                data = percent_processor_time
        except Exception:
            failed = True
    else:
        # Unknown measurement type.
        failed = True

    if failed:
        return NO_DATA
    # An empty result means nothing was found. This covers a real empty
    # sequence as well as the string forms.
    if isinstance(data, (list, tuple)) and not data:
        return NO_DATA
    if data in EMPTY_STRINGS:
        return NO_DATA
    return data
cheers
edited for correct cpu timings issue if anyone tried to run it :D
so after a long search i was able to mash something together that gets me the info needed.
import time
from ctypes import *
from ctypes.wintypes import *
import win32pdh
# with help from here http://coding.derkeiler.com/Archive/Python/comp.lang.python/2007-10/msg00717.html
# the following has been mashed together to get the info needed
def GetProcessID(name):
    """Return the value of the "ID Process" counter for instance ``name``.

    ``name`` is looked up among the instances of the "Process" performance
    object. Returns ``None`` when no such instance is listed.
    """
    perf_object = "Process"
    _items, instances = win32pdh.EnumObjectItems(
        None, None, perf_object, win32pdh.PERF_DETAIL_WIZARD)
    if name not in instances:
        return None

    val = None
    query = win32pdh.OpenQuery()
    try:
        path = win32pdh.MakeCounterPath((None, perf_object, name, None, 0, "ID Process"))
        counter = win32pdh.AddCounter(query, path)
        try:
            # Collect twice with a short pause in between before reading
            # the formatted value.
            win32pdh.CollectQueryData(query)
            time.sleep(0.01)
            win32pdh.CollectQueryData(query)
            _value_type, val = win32pdh.GetFormattedCounterValue(
                counter, win32pdh.PDH_FMT_LONG)
        finally:
            win32pdh.RemoveCounter(counter)
    finally:
        # Release the query even when one of the calls above raises.
        win32pdh.CloseQuery(query)
    return val
processIDs = GetProcessID('OUTLOOK')  # Remember this is case sensitive
PQI = 0x400  # PROCESS_QUERY_INFORMATION access right
# Flags for GetGuiResources: which kind of object to count.
GR_GDIOBJECTS, GR_USEROBJECTS = 0, 1

if processIDs is None:
    # GetProcessID found no such process instance, so there is nothing to open.
    print("process not found")
else:
    # Open a handle on the process so that we can query it.
    OpenProcessHandle = windll.kernel32.OpenProcess(PQI, 0, processIDs)
    if not OpenProcessHandle:
        print("could not open process")
    else:
        try:
            print(windll.user32.GetGuiResources(OpenProcessHandle, GR_GDIOBJECTS))
            print(windll.user32.GetGuiResources(OpenProcessHandle, GR_USEROBJECTS))
        finally:
            # Always give the process handle back, even if a query fails.
            windll.kernel32.CloseHandle(OpenProcessHandle)
hope that helps
For GDI count, I think a simpler, cleaner monitoring script is as follows:
import time, psutil
from ctypes import *
def getPID(processName):
    """Return the PID of the first process whose name contains ``processName``.

    The comparison ignores case. Processes that raise one of the psutil
    errors listed below while being inspected are skipped. Returns
    ``None`` when nothing matches.
    """
    skippable = (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess)
    for candidate in psutil.process_iter():
        try:
            found = processName.lower() in candidate.name().lower()
            if found:
                return candidate.pid
        except skippable:
            continue
    return None
def getGDIcount(PID):
    """Return the GDI object count that GetGuiResources reports for process ``PID``."""
    query_access = 0x400  # access mask passed to OpenProcess
    gdi_objects = 0       # GetGuiResources flag selecting GDI objects
    handle = windll.kernel32.OpenProcess(query_access, 0, PID)
    count = windll.user32.GetGuiResources(handle, gdi_objects)
    windll.kernel32.CloseHandle(handle)
    return count
# Look the process up once, then print a timestamped GDI count every
# second until the script is interrupted.
PID = getPID('Outlook')
while True:
    GDIcount = getGDIcount(PID)
    print(f"{time.ctime()}, {GDIcount}")
    time.sleep(1)

Categories