Extending PostgreSQL Collector - Python

I am getting two different results with the same query.
I am extending Diamond PostgresqlCollector https://github.com/python-diamond/Diamond/blob/master/src/collectors/postgres/postgres.py in order to track a new metric.
Specifically, I am trying to implement the bloat estimate queries specified here: https://github.com/ioguix/pgsql-bloat-estimation/blob/master/table/table_bloat.sql
Where I am having trouble is that when I run the query from the psql command prompt I get results that include the 'public' schemaname, but when the query is run by Diamond there are no results that include 'public'. Instead, entries are only available for pg_catalog and information_schema. I can see this by checking the logs in /var/log/upstart/diamond.log.
The only cause I can imagine is a permissions error for the 'diamond' user, but I can see at the psql command line that the user diamond exists and has the Superuser privilege. And I do get results from pg_catalog, so I can get some stats, just not from the public schema of the database I'm most interested in.
Has anyone extended the postgresql collector and seen this behavior, or have a suggestion of what to try next?
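One check I can run is confirming which database and role the collector's connection actually sees, since the bloat query only reports tables that show up in pg_stats for the connected database. Below is a minimal sketch using psycopg2 (which I believe the built-in collector uses); host/user/dbname are placeholders for my setup.
# Quick sanity check, run with the same role/database the collector would use.
# Connection parameters are placeholders.
import psycopg2

conn = psycopg2.connect(host='localhost', user='diamond', dbname='mydb')
cur = conn.cursor()
cur.execute("SELECT current_user, current_database()")
print(cur.fetchone())
cur.execute("SELECT count(*) FROM pg_stats WHERE schemaname = 'public'")
print(cur.fetchone())  # zero rows here would explain the missing 'public' results
conn.close()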
Adding the relevant files here. The system I am testing on is a Vagrant machine, but I am using a Puppet manifest to replicate the production environment as closely as possible.
/etc/diamond/diamond.conf
[server]
pid_file = /var/run/diamond.pid
collectors_path = /usr/share/diamond/collectors/, /usr/local/share/diamond/collectors/
collectors_config_path = /etc/diamond/collectors/
handlers_path = /usr/share/diamond/handlers/
handlers_config_path = /etc/diamond/handlers/
handlers = diamond.handler.archive.ArchiveHandler
[handlers]
# logging handlers
keys = console
[[default]]
[[GraphitePickleHandler]]
host = graphite-01.local
port = 2014
timeout = 15
batch = 10
# ArchiveHandler writes stats to a local logfile.
# Much easier for testing and debugging.
[[ArchiveHandler]]
keys = watched_file
# File to write archive log files
log_file = /var/log/diamond/archive.log
[collectors]
[[default]]
hostname_method = fqdn_rev
interval = 60
[[CPUCollector]]
enabled = True
percore = True
[[DiskSpaceCollector]]
enabled = False
[[DiskUsageCollector]]
enabled = False
[[LoadAverageCollector]]
enabled = True
[[MemoryCollector]]
enabled = True
[[VMStatCollector]]
enabled = False
[[UserScriptsCollector]]
enabled = True
[loggers]
keys = root
[formatters]
keys = default
[logger_root]
level = INFO
handlers = console
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = default
[handler_watched_file]
class = handlers.WatchedFileHandler
level = DEBUG
formatter = default
[formatter_default]
format = [%(asctime)s] [%(levelname)s] [%(threadName)s] %(message)s
[configs]
path = "/etc/diamond/configs/"
extension = ".conf"
/etc/diamond/configs/postgres-service.conf
[collectors]
# Custom internal Postgresql collector. See diamond-service/files/collectors/custompg/custompg.py
[[CustomPostgresqlCollector]]
enabled = True
interval = 10
extended = True
metrics_blacklist = [^.]+\.indexes.*
pg_version = 9.3
user = diamond
# has_admin currently only controls if diamond should report how many WAL
# files exist on disk (although the query has a bug in it). However, as an
# unprivileged user, diamond can only see queries that are running as the same
# user. So in order to get the full picture of running queries on a multi-user
# system, diamond should have superuser privileges.
has_admin = False
/usr/local/share/diamond/collectors/custompg/custompg.py
import os
import sys
# Make sure we can import the existing postgres collector
try:
import postgres
from postgres import QueryStats, metrics_registry, registry
except ImportError:
# It's likely that this is being imported in a test or script
# outside of the normal diamond runpath.
# In these instances, try to add COLLECTOR_PATH to path and import again.
# i.e. export PYTHONPATH=$PYTHONPATH:/usr/share/diamond/collectors/postgres
raise ImportError("Unable to import built-in postgres collector. "
"Make sure the collector path is added to PYTHONPATH.")
class CustomPostgresqlCollector(postgres.PostgresqlCollector):
"""
Collector subclass to differentiate enabling/disabling
company-specific Postgres metric collection.
"""
# Even though nothing is being extended, this class is
# still needed for the additional queries to get picked up
# by Diamond.
pass
class NonVacuumLongestRunningQueries(QueryStats):
"""
Differentiate between vacuum and non-vacuum queries.
The built-in longest running queries metric collection
doesn't account for/filter vacuum operations.
"""
path = "%(datname)s.non_vacuum.longest_running.%(metric)s"
multi_db = True
# This query is a modified version of
# https://github.com/python-diamond/Diamond/blob/0fda1835308255e3ac4b287724340baf16b27bb1/src/collectors/postgres/postgres.py#L506-L519
base_query = """
SELECT 'query',
COALESCE(max(extract(epoch FROM CURRENT_TIMESTAMP-query_start)),0)
FROM pg_stat_activity
WHERE %s
AND %s
UNION ALL
SELECT 'transaction',
COALESCE(max(extract(epoch FROM CURRENT_TIMESTAMP-xact_start)),0)
FROM pg_stat_activity
WHERE 1=1
AND %s
"""
exclude_vacuum_queries = "query NOT LIKE '%VACUUM%'"
# Two query versions in case collector needs to run on Postgres < 9.2
query = base_query % ("current_query NOT LIKE '<IDLE%'",
exclude_vacuum_queries,
exclude_vacuum_queries)
post_92_query = base_query % ("state NOT LIKE 'idle%'",
exclude_vacuum_queries,
exclude_vacuum_queries)
class UserTableVacuumStats(QueryStats):
"""Additional per-table vacuuming stats."""
path = "%(datname)s.tables.%(schemaname)s.%(relname)s.vacuum.%(metric)s"
multi_db = True
# http://www.postgresql.org/docs/9.3/static/monitoring-stats.html#PG-STAT-ALL-TABLES-VIEW
# Also filter out generally non-volatile system tables.
base_query = """
SELECT relname, schemaname, vacuum_count, autovacuum_count
FROM pg_stat_all_tables
WHERE schemaname NOT IN ('pg_catalog', 'information_schema');
"""
query = base_query
class TableBloatSize(QueryStats):
""" Track estimated table bloat size using modified query written by ioguix:
https://github.com/ioguix/pgsql-bloat-estimation/blob/master/table/table_bloat.sql
WARNING: executed with a non-superuser role, the query inspects only
tables you are granted to read.
"""
path = "%(datname)s.tables.%(schemaname)s.%(relname)s.%(metric)s"
multi_db = True
query = """
SELECT schemaname, relname, (tblpages-est_tblpages_ff)*bs AS bloat_size
FROM (
SELECT ceil( reltuples / ( (bs-page_hdr)/tpl_size ) ) + ceil( toasttuples / 4 ) AS est_tblpages,
ceil( reltuples / ( (bs-page_hdr)*fillfactor/(tpl_size*100) ) ) + ceil( toasttuples / 4 ) AS est_tblpages_ff,
tblpages, fillfactor, bs, tblid, schemaname, relname, heappages, toastpages
FROM (
SELECT
( 4 + tpl_hdr_size + tpl_data_size + (2*ma)
- CASE WHEN tpl_hdr_size%ma = 0 THEN ma ELSE tpl_hdr_size%ma END
- CASE WHEN ceil(tpl_data_size)::int%ma = 0 THEN ma ELSE ceil(tpl_data_size)::int%ma END
) AS tpl_size, (heappages + toastpages) AS tblpages, heappages,
toastpages, reltuples, toasttuples, bs, page_hdr, tblid, schemaname, relname, fillfactor
FROM (
SELECT
tbl.oid AS tblid, ns.nspname AS schemaname, tbl.relname AS relname, tbl.reltuples,
tbl.relpages AS heappages, coalesce(toast.relpages, 0) AS toastpages,
coalesce(toast.reltuples, 0) AS toasttuples,
coalesce(substring(
array_to_string(tbl.reloptions, ' ')
FROM '%fillfactor=#"__#"%' FOR '#')::smallint, 100) AS fillfactor,
current_setting('block_size')::numeric AS bs,
CASE WHEN version()~'mingw32' OR version()~'64-bit|x86_64|ppc64|ia64|amd64' THEN 8 ELSE 4 END AS ma,
24 AS page_hdr,
23 + CASE WHEN MAX(coalesce(null_frac,0)) > 0 THEN ( 7 + count(*) ) / 8 ELSE 0::int END
+ CASE WHEN tbl.relhasoids THEN 4 ELSE 0 END AS tpl_hdr_size,
sum( (1-coalesce(s.null_frac, 0)) * coalesce(s.avg_width, 1024) ) AS tpl_data_size
FROM pg_attribute AS att
JOIN pg_class AS tbl ON att.attrelid = tbl.oid
JOIN pg_namespace AS ns ON ns.oid = tbl.relnamespace
JOIN pg_stats AS s ON s.schemaname=ns.nspname
AND s.tablename = tbl.relname AND s.inherited=false AND s.attname=att.attname
LEFT JOIN pg_class AS toast ON tbl.reltoastrelid = toast.oid
WHERE att.attnum > 0 AND NOT att.attisdropped
AND tbl.relkind = 'r'
GROUP BY 1,2,3,4,5,6,7,8,9,10, tbl.relhasoids
ORDER BY 2,3
) AS s
) AS s2
) AS s3
WHERE schemaname='public';
"""
class BtreeBloatSize(QueryStats):
""" Track estimated index bloat size using modified query written by ioguix:
https://github.com/ioguix/pgsql-bloat-estimation/blob/master/btree/btree_bloat.sql
WARNING: executed with a non-superuser role, the query inspect only index on tables you are granted to read.
WARNING: rows with is_na = 't' are known to have bad statistics ("name" type is not supported). Not relevant to Public schema
"""
path = "%(datname)s.tables.%(schemaname)s.%(relname)s.%(indexrelname)s.%(metric)s"
multi_db = True
query = """
SELECT nspname AS schemaname, relname, indexrelname,
bs*(relpages-est_pages_ff) AS bloat_size
FROM (
SELECT coalesce(1 +
ceil(reltuples/floor((bs-pageopqdata-pagehdr)*fillfactor/(100*(4+nulldatahdrwidth)::float))), 0
) AS est_pages_ff,
bs, nspname, relname, indexrelname, relpages, fillfactor
FROM (
SELECT maxalign, bs, nspname, relname, indexrelname, reltuples, relpages, relam, fillfactor,
( index_tuple_hdr_bm +
maxalign - CASE -- Add padding to the index tuple header to align on MAXALIGN
WHEN index_tuple_hdr_bm%maxalign = 0 THEN maxalign
ELSE index_tuple_hdr_bm%maxalign
END
+ nulldatawidth + maxalign - CASE -- Add padding to the data to align on MAXALIGN
WHEN nulldatawidth = 0 THEN 0
WHEN nulldatawidth::integer%maxalign = 0 THEN maxalign
ELSE nulldatawidth::integer%maxalign
END
)::numeric AS nulldatahdrwidth, pagehdr, pageopqdata
FROM (
SELECT
i.nspname, i.relname, i.indexrelname, i.reltuples, i.relpages, i.relam,
current_setting('block_size')::numeric AS bs, fillfactor,
CASE
-- MAXALIGN: 4 on 32bits, 8 on 64bits (and mingw32 ?)
WHEN version() ~ 'mingw32' OR version() ~ '64-bit|x86_64|ppc64|ia64|amd64' THEN 8
ELSE 4
END AS maxalign,
/* per page header, fixed size: 20 for 7.X, 24 for others */
24 AS pagehdr,
/* per page btree opaque data */
16 AS pageopqdata,
/* per tuple header: add IndexAttributeBitMapData if some cols are null-able */
CASE WHEN max(coalesce(s.null_frac,0)) = 0
-- IndexTupleData size
THEN 2
/* IndexTupleData size + IndexAttributeBitMapData size ( max num filed per index + 8 - 1 /8) */
ELSE 2 + (( 32 + 8 - 1 ) / 8)
END AS index_tuple_hdr_bm,
/* data len: we remove null values save space using it fractionnal part from stats */
sum( (1-coalesce(s.null_frac, 0)) * coalesce(s.avg_width, 1024)) AS nulldatawidth
FROM pg_attribute AS a
JOIN (
SELECT nspname, tbl.relname AS relname, idx.relname AS indexrelname, idx.reltuples, idx.relpages, idx.relam,
indrelid, indexrelid, indkey::smallint[] AS attnum,
coalesce(substring(
array_to_string(idx.reloptions, ' ')
from 'fillfactor=([0-9]+)')::smallint, 90) AS fillfactor
FROM pg_index
JOIN pg_class idx ON idx.oid=pg_index.indexrelid
JOIN pg_class tbl ON tbl.oid=pg_index.indrelid
JOIN pg_namespace ON pg_namespace.oid = idx.relnamespace
WHERE pg_index.indisvalid AND tbl.relkind = 'r' AND idx.relpages > 0
) AS i ON a.attrelid = i.indexrelid
JOIN pg_stats AS s ON s.schemaname = i.nspname
AND ((s.tablename = i.relname AND s.attname = pg_catalog.pg_get_indexdef(a.attrelid, a.attnum, TRUE)) -- stats from tbl
OR (s.tablename = i.indexrelname AND s.attname = a.attname))-- stats from functionnal cols
JOIN pg_type AS t ON a.atttypid = t.oid
WHERE a.attnum > 0
GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9
) AS s1
) AS s2
JOIN pg_am am ON s2.relam = am.oid WHERE am.amname = 'btree'
) AS sub
WHERE nspname='public'
ORDER BY 1,2,3;
"""
# Add the new metric queries to the
# registered set used by the collecting method.
metrics_registry.update({
'NonVacuumLongestRunningQueries': NonVacuumLongestRunningQueries,
'UserTableVacuumStats': UserTableVacuumStats,
'TableBloatSize': TableBloatSize,
'BtreeBloatSize': BtreeBloatSize,
})
registry['extended'] += ['NonVacuumLongestRunningQueries',
'UserTableVacuumStats',
'TableBloatSize',
'BtreeBloatSize']
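To compare against what I see in psql, I can also run the new query outside of Diamond with the same role and database. A rough standalone sketch follows (assuming psycopg2, placeholder connection settings, and the collector directories on PYTHONPATH so custompg imports):
# Standalone harness: execute the TableBloatSize query as the 'diamond' role
# against the database of interest and print whatever rows come back.
import psycopg2
from custompg import TableBloatSize

conn = psycopg2.connect(host='localhost', user='diamond', dbname='mydb')
cur = conn.cursor()
cur.execute(TableBloatSize.query)
for schemaname, relname, bloat_size in cur.fetchall():
    print(schemaname, relname, bloat_size)
conn.close()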

Related

Python SQLite - UPDATE command executes but doesn't update

import sqlite3 as sql
v = (161.5, 164.5, 157.975, 158.5375, 159.3125, 160.325, 74052, 8)
try:
connection = sql.connect("data.db")
sql_update_query = """UPDATE RECORDS SET OPEN = ?,HIGH = ?,LOW = ?,CLOSE = ?,LAST = ?,PREVCLOSE = ?,TOTTRDQTY = ? WHERE ROWID = ?"""
cursor = connection.cursor()
cursor.execute(sql_update_query,v)
connection.commit()
print("Total", cursor.rowcount, "Records updated successfully")
connection.close()
except Exception as e:
print(e)
Here is the code that I am using to update the data in my table named "RECORDS".
I checked whether my SQL statement was wrong in DB Browser:
UPDATE RECORDS SET OPEN = 161.5,HIGH = 164.5,LOW = 157.975,CLOSE = 158.5375,LAST = 159.3125,PREVCLOSE = 160.325,TOTTRDQTY = 74052 WHERE ROWID = 8
Output was:
Execution finished without errors.
Result: query executed successfully. Took 2ms, 1 rows affected
At line 1:
UPDATE RECORDS SET OPEN = 161.5,HIGH = 164.5,LOW = 157.975,CLOSE = 158.5375,LAST = 159.3125,PREVCLOSE = 160.325,TOTTRDQTY = 74052 WHERE ROWID = 8
But when I run my code in Python, it just doesn't update.
I get:
Total 0 Records updated successfully
My Python code runs but nothing changes in the database. Please help.
Edit: 29-04-2022:
Since my code is fine, maybe the way my database is created is causing this issue.
So I am adding the code that I use to create the DB file.
import os
import pandas as pd
import sqlite3 as sql
connection = sql.connect("data.db")
d = os.listdir("Bhavcopy/")
for f in d:
fn = "Bhavcopy/" + f
df = pd.read_excel(fn)
df["TIMESTAMP"] = pd.to_datetime(df.TIMESTAMP)
df["TIMESTAMP"] = df['TIMESTAMP'].dt.strftime("%d-%m-%Y")
df.rename(columns={"TIMESTAMP":"DATE"},inplace=True)
df.set_index("DATE",drop=True,inplace=True)
df['CHANGE'] = df.CLOSE - df.PREVCLOSE
df['PERCENT'] = round((df.CHANGE/df.PREVCLOSE) * 100, 2)
df.to_sql('RECORDS', con=connection, if_exists='append')
connection.close()
Sample of data that is being added to the database:
SYMBOL SERIES OPEN ... TIMESTAMP TOTALTRADES ISIN
0 20MICRONS EQ 58.95 ... 01-JAN-2018 1527 INE144J01027
1 3IINFOTECH EQ 8.40 ... 01-JAN-2018 7133 INE748C01020
2 3MINDIA EQ 18901.00 ... 01-JAN-2018 728 INE470A01017
3 5PAISA EQ 383.00 ... 01-JAN-2018 975 INE618L01018
4 63MOONS EQ 119.55 ... 01-JAN-2018 6628 INE111B01023
[5 rows x 13 columns]
SYMBOL SERIES OPEN ... TIMESTAMP TOTALTRADES ISIN
1412 ZODJRDMKJ EQ 43.50 ... 01-JAN-2018 10 INE077B01018
1413 ZUARI EQ 555.00 ... 01-JAN-2018 2097 INE840M01016
1414 ZUARIGLOB EQ 254.15 ... 01-JAN-2018 1670 INE217A01012
1415 ZYDUSWELL EQ 1051.00 ... 01-JAN-2018 688 INE768C01010
1416 ZYLOG EQ 4.80 ... 01-JAN-2018 635 INE225I01026
[5 rows x 13 columns]
Shape of the excel files:
(1417, 13)
Also someone asked how I am creating the table:
import sqlite3 as sql
connection = sql.connect("data.db")
cursor = connection.cursor()
#create our table:
command1 = """
CREATE TABLE IF NOT EXISTS
RECORDS(
DATE TEXT NOT NULL,
SYMBOL TEXT NOT NULL,
SERIES TEXT NOT NULL,
OPEN REAL,
HIGH REAL,
LOW REAL,
CLOSE REAL,
LAST REAL,
PREVCLOSE REAL,
TOTTRDQTY INT,
TOTTRDVAL REAL,
TOTALTRADES INT,
ISIN TEXT,
CHANGE REAL,
PERCENT REAL
)
"""
cursor.execute(command1)
connection.commit()
connection.close()
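For reference, here is the kind of check that shows the declared column types and the Python types actually being bound (a small sketch; data.db and the values are the same as above):
# Inspect the declared column affinities and the Python types being bound.
import sqlite3 as sql

connection = sql.connect("data.db")
cursor = connection.cursor()
for cid, name, coltype, notnull, default, pk in cursor.execute("PRAGMA table_info(RECORDS)"):
    print(name, coltype)

v = (161.5, 164.5, 157.975, 158.5375, 159.3125, 160.325, 74052, 8)
print([type(x).__name__ for x in v])  # values pulled from a dataframe may be numpy types, not plain float/int
connection.close()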
I created your table with only the numeric fields that needed to be updated and ran your code; it worked. So in the end it had to be a datatype mismatch. I'm glad you found it :)
Your code works fine on both Windows and Linux; the only reason to see that kind of behavior is that you are modifying two files with the same name in different locations. Check which file is being referenced in your DB Browser.
And if in doubt, prefer absolute paths, as in your comment above:
connection = sql.connect("C:/Users/Abinash/Desktop/data.db")
So I found out why the code was not working even though it was correct, thanks to gimix.
I was creating the variable v:
v = (161.5, 164.5, 157.975, 158.5375, 159.3125, 160.325, 74052, 8)
by reading it from a dataframe. When everyone said that my code was correct and gimix asked how I created the table, I realized that it could be a datatype mismatch. On checking, I found that one of the values was a string.
So this change:
i = 0
o = float(adjdf['OPEN'].iloc[i])
h = float(adjdf['HIGH'].iloc[i])
l = float(adjdf['LOW'].iloc[i])
c = float(adjdf['CLOSE'].iloc[i])
last = float(adjdf['LAST'].iloc[i])
pc = float(adjdf['PREVCLOSE'].iloc[i])
tq = int(adjdf['TOTTRDQTY'].iloc[i])
did = int(adjdf['ID'].iloc[i])
v = (o,h,l,c,last,pc,tq,did)
This fixed the issue. Thank you very much for the help everyone.
I finally got:
Total 1 Records updated successfully
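For anyone updating many rows the same way, the per-value conversion can be batched with executemany. This is only a sketch that continues from the snippets above; adjdf, cursor, sql_update_query and the ID column are assumptions based on my code:
# Convert each dataframe row to native Python types and update in one batch.
params = [
    (float(r.OPEN), float(r.HIGH), float(r.LOW), float(r.CLOSE),
     float(r.LAST), float(r.PREVCLOSE), int(r.TOTTRDQTY), int(r.ID))
    for r in adjdf.itertuples(index=False)
]
cursor.executemany(sql_update_query, params)
connection.commit()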

Operands missing in MySQL statement

I'm getting an error from the MySQL module complaining that an operand should contain 4 columns. I just don't see which operands it's looking for. It works for some case types and not others. Here is the traceback (trimmed at the bottom, where it refers to the MySQL library):
Traceback (most recent call last):
File "/Users/christoph/PycharmProjects/physicianWorkQueueProject/physicianWorkQueueProject.py", line 158, in <module>
parse()
File "/Users/christoph/PycharmProjects/physicianWorkQueueProject/physicianWorkQueueProject.py", line 138, in parse
c.execute(getPhysiciansql_cmd)
...
mysql.connector.errors.DataError: 1241 (21000): Operand should contain 4 column(s)
#!/usr/bin/python
##################################################
# This is a prototype pathologist work queue management system
# The program calls out to a database of pathologists, their specialities,
# and an index of case types and their intended specialists
#
# Usage: Run at your command line. You will then enter case numbers (which aren't validated (currently))
# and case types (which are validated). The program will distribute the entered case as follows:
#
# If the case is intended for generalists, a system that amounts to names being pulled from a hat
# is employed. When a name is selected (at random), the case is entered into that pathologists' work queue.
# That pathologists' name is not returned to the pool.
# The cycle will then repeat with a random name picked every time, in this same way, for generalist-requiring cases,
# until the entirety of names have been pulled. At this point, all names are returned to the pool and the whole cycle
# begins again
#
# For the specialist requiring cases, the name-draw system is bypassed and the case is directly entered
# into the pathologists' work queue.
# 16 Aug 2018
# My Real Name
##################################################
import mysql.connector as mariaDB
import time
import pandas as pd
from random import choice
def distributefairly(inputCaseNumber, inputcasetype):
# function distributefairly does a draw out of a hat, with each name being pulled and the pot shrinking until none are
# left at which point all names are added back
c.execute("SELECT physicianName FROM physicianNamesSpecialties;")
originalPhysicianList = c.fetchall()
physicianList = originalPhysicianList
# print("counter at:",count)
chosenPhysician = choice(physicianList)
pos = physicianList.index(chosenPhysician)
physicianList.pop(pos)
cleanedUpChosenPhysician = chosenPhysician[0]
insert(cleanedUpChosenPhysician)
print("This case is going to", cleanedUpChosenPhysician + ".")
select()
increment()
if len(physicianList) == 0:
reset()
def reset():
# this resets the counter the distributefairly module calls
global count
global physicianList
count = 0
c.execute("SELECT physicianName FROM physicianNamesSpecialties;")
physicianList = c.fetchall()
def increment():
# increments the counter of the distributefairly module
global count
count += 1
def print_count():
print(count)
def insert(cleanedUpPhysResult):
# adds inputted cases into the workQueue table
global inputCaseNumber
global inputCaseType
ts = time.gmtime()
readableTs = time.strftime("%Y-%m-%d %H:%M:%S", ts)
c2.execute("INSERT INTO workQueue (name, caseNumber, timestamp, tableCaseType) values (%s,%s,%s,%s)", (cleanedUpPhysResult, inputCaseNumber, readableTs, str(inputCaseType)))
conn2.commit()
def select():
# this outputs the workQueue after every addition
sql = "SELECT * FROM workQueue"
c2.execute(sql)
rWQ = c2.fetchall()
print(pd.DataFrame(rWQ, columns= ['Name','Case Number','Time Stamp','Specialty','Row ID'])) # .set_index('Row ID')
def startup():
# create()
global inputCaseType
global inputCaseNumber
inputCaseNumber = input("Enter Case Number: ")
inputCaseType = input("Enter case type (use proper abbreviations): ")
def parse():
global inputCaseNumber
global inputCaseType
caseInputsql_cmd = "SELECT specialtyRequiredToProcess,Description FROM caseTypes WHERE caseTypeName='{}'".format(inputCaseType)
c.execute(caseInputsql_cmd)
rows_returned = c.fetchall()
if not rows_returned:
print("No match to table of specimen types returned. Check the case type abbreviation and try again.")
return
else:
for row in rows_returned:
r = row[0]
d = row[1]
print("This is a", r, "service case. It is a", d,"type case.")
if r != "GENERALIST":
getPhysiciansql_cmd = "SELECT physicianName FROM physicianNamesSpecialties WHERE (specialty, specialty2, specialty3, specialty4) ='{}'".format(r)
c.execute(getPhysiciansql_cmd)
physResult = choice(c.fetchall())
cleanedUpPhysResult = physResult[0]
print("This case is going to", cleanedUpPhysResult+".")
insert(cleanedUpPhysResult)
select()
else:
distributefairly(inputCaseNumber, inputCaseType)
conn = mariaDB.connect(host='xxxxx', user='xxxxx',password='xxxxxx',db='lookupDB')
conn2 = mariaDB.connect(host='xxxxxxl', user='xxxxx',password='xxxxxxx',db='workQueue')
c = conn.cursor()
c2 = conn2.cursor()
count = 0
while True:
startup()
parse()
This line of code .execute(...) will fail:
getPhysiciansql_cmd =
"SELECT physicianName FROM physicianNamesSpecialties WHERE (specialty, specialty2, specialty3, specialty4) ='{}'".format(r)
c.execute(getPhysiciansql_cmd)
My Var r holds this data:
print("This is a", r, "service case. It is a", d,"type case.")
This is a THORACIC service case. It is a TRANSBRONCHIAL WANG NEEDLE ASPIRATION type case.
My Var getPhysiciansql_cmd holds this data:
getPhysiciansql_cmd = SELECT physicianName FROM
physicianNamesSpecialties WHERE (specialty, specialty2, specialty3, specialty4) ='THORACIC'
I think this is just a matter of how I'm using the WHERE clause. I saw somewhere that when a WHERE clause is written with multiple fields, you put parentheses around the fields. I've experimented with reducing this to the bare minimum and using the OR operator in the WHERE clause, and I have successfully received a proper response.
import mysql.connector as mariaDB
conn = mariaDB.connect(host='xxxxx', user='xxxx',password='xxxxx',db='lookupDB')
conn2 = mariaDB.connect(host='xxxxx', user='xxxx',password='xxxxxx',db='workQueue')
c = conn.cursor()
c2 = conn2.cursor()
inputSpecialty = input("specialty? ")
r = inputSpecialty
c.execute("SELECT physicianName FROM physicianNamesSpecialties WHERE specialty = %s OR specialty2 = %s OR specialty3 = %s OR specialty4 = %s", (r,r,r,r))
physResult = c.fetchall()
cleanedUpPhysResult = physResult
print(cleanedUpPhysResult)
Output:
specialty? THORACIC
[('Song',), ('Han',), ('He',), ('Goldfischer',)]
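Tying that back into parse(), the same OR-based filter can be used with query parameters instead of string formatting. A sketch reusing the names from the snippets above:
# Parameterized version of the specialist lookup: mysql.connector fills in the
# four %s placeholders, and the OR chain replaces the row-value comparison that
# triggered "Operand should contain 4 column(s)".
getPhysiciansql_cmd = (
    "SELECT physicianName FROM physicianNamesSpecialties "
    "WHERE specialty = %s OR specialty2 = %s OR specialty3 = %s OR specialty4 = %s"
)
c.execute(getPhysiciansql_cmd, (r, r, r, r))
physResult = choice(c.fetchall())
cleanedUpPhysResult = physResult[0]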

How to get process's base address with MODULEENTRY32?

I'm looking to do something like in this example: Python - How to get the start/base address of a process?. I'm having the same issue as the person in that topic, in that the pointers Cheat Engine provides are relative to the base address of the process itself.
I've looked around and it looks like the best solution is to use ctypes and the MODULEENTRY32 to store snapshots of processes and analyze their modBaseAddr.
Here is my current code
import os.path, ctypes, ctypes.wintypes
from ctypes import *
from ctypes.wintypes import *
PROCESS_QUERY_INFORMATION = (0x0400)
PROCESS_VM_OPERATION = (0x0008)
PROCESS_VM_READ = (0x0010)
PROCESS_VM_WRITE = (0x0020)
TH32CS_SNAPMODULE = (0x00000008)
CreateToolhelp32Snapshot= ctypes.windll.kernel32.CreateToolhelp32Snapshot
Process32First = ctypes.windll.kernel32.Process32First
Process32Next = ctypes.windll.kernel32.Process32Next
Module32First = ctypes.windll.kernel32.Module32First
Module32Next = ctypes.windll.kernel32.Module32Next
GetLastError = ctypes.windll.kernel32.GetLastError
OpenProcess = ctypes.windll.kernel32.OpenProcess
GetPriorityClass = ctypes.windll.kernel32.GetPriorityClass
CloseHandle = ctypes.windll.kernel32.CloseHandle
class MODULEENTRY32(Structure):
_fields_ = [ ( 'dwSize' , DWORD ) ,
( 'th32ModuleID' , DWORD ),
( 'th32ProcessID' , DWORD ),
( 'GlblcntUsage' , DWORD ),
( 'ProccntUsage' , DWORD ) ,
( 'modBaseAddr' , POINTER(BYTE)) ,
( 'modBaseSize' , DWORD ) ,
( 'hModule' , HMODULE ) ,
( 'szModule' , c_char * 256 ),
( 'szExePath' , c_char * 260 ) ]
def GetBaseAddr(ProcId, ProcName):
me32 = MODULEENTRY32()
me32.dwSize = sizeof(me32)
hSnapshot = CreateToolhelp32Snapshot( TH32CS_SNAPMODULE, ProcId)
if GetLastError() != 0:
CloseHandle(hSnapshot)
print 'Handle Error %s' % WinError()
return 'Error'
else:
if Module32First(hSnapshot, byref(me32)):
if me32.szModule == ProcName:
CloseHandle(hSnapshot)
return id(me32.modBaseAddr)
else:
Module32Next(hSnapshot, byref(me32))
while int(GetLastError())!= 18:
if me32.szModule == ProcName:
CloseHandle(hSnapshot)
return id(me32.modBaseAddr)
else:
Module32Next(hSnapshot, byref(me32))
CloseHandle(hSnapshot)
print 'Couldn\'t find Process with name %s' % ProcName
else:
print 'Module32First is False %s' % WinError()
CloseHandle(hSnapshot)
def GetProcessIdByName( pName):
if pName.endswith('.exe'):
pass
else:
pName = pName+'.exe'
ProcessIds, BytesReturned = EnumProcesses()
for index in range(BytesReturned / ctypes.sizeof(ctypes.wintypes.DWORD)):
ProcessId = ProcessIds[index]
hProcess = ctypes.windll.kernel32.OpenProcess(PROCESS_QUERY_INFORMATION, False, ProcessId)
if hProcess:
ImageFileName = (ctypes.c_char*MAX_PATH)()
if ctypes.windll.psapi.GetProcessImageFileNameA(hProcess, ImageFileName, MAX_PATH)>0:
filename = os.path.basename(ImageFileName.value)
if filename == pName:
return ProcessId
CloseHandle(hProcess)
def EnumProcesses():
count = 32
while True:
ProcessIds = (ctypes.wintypes.DWORD*count)()
cb = ctypes.sizeof(ProcessIds)
BytesReturned = ctypes.wintypes.DWORD()
if ctypes.windll.Psapi.EnumProcesses(ctypes.byref(ProcessIds), cb, ctypes.byref(BytesReturned)):
if BytesReturned.value<cb:
return ProcessIds, BytesReturned.value
break
else:
count *= 2
else:
return None
if __name__ == '__main__':
ProcId = GetProcessIdByName('RocketLeague.exe')
#print ProcId
print hex(GetBaseAddr(ProcId, 'RocketLeague.exe'))
#print hex(GetBaseAddr(8252,'RocketLeague.exe'))
Now my understanding of memory isn't the greatest, but I'd figure that the base address should be static while a program is running. When I do get this code to run, the modBaseAddr I get back changes every time I run it. Another weird issue I'm having is that without that print ProcId statement, running the program returns an ERROR_ACCESS_DENIED (error 5) from line 41 (this has something to do with the CreateToolhelp32Snapshot function, I assume, as I have admin rights on the computer). With the print statement, however, the program runs through, giving me a different modBaseAddr every time. If I feed the GetBaseAddr function the ProcessId manually it also works without the print statement; again, however, it gives me a random address every time.
If anyone could provide me any help or point me in the right direction I'd really appreciate it!
Clarification: MODULEENTRY32 stores information about modules, not processes. When you call CreateToolhelp32Snapshot using TH32CS_SNAPMODULE you are getting the modules loaded by the process, not the processes themselves.
Instead of getting the MODULEENTRY32 in combination with EnumProcesses, you can use CreateToolhelp32Snapshot with TH32CS_SNAPPROCESS to get a list of processes in the form of PROCESSENTRY32 structs, which also contain the process identifier (a rough sketch of that approach follows the snippet below).
Despite being a user with administrator privileges, you must also run the process as an administrator.
You should also ensure you're initializing your MODULEENTRY32 to {0} for proper error handling, so the returned value isn't subject to the undefined behavior of uninitialized memory.
I do not know the specific cause of your issue, but I have used source code for this purpose that is very robust and may be a plug-and-play alternative to what you're currently using. The important snippet follows, and the full source is available here.
def ListProcessModules( ProcessID ):
hModuleSnap = c_void_p(0)
me32 = MODULEENTRY32()
me32.dwSize = sizeof( MODULEENTRY32 )
hModuleSnap = CreateToolhelp32Snapshot( TH32CS_SNAPMODULE, ProcessID )
ret = Module32First( hModuleSnap, pointer(me32) )
if ret == 0 :
print 'ListProcessModules() Error on Module32First[%d]' % GetLastError()
CloseHandle( hModuleSnap )
return False
while ret :
print " MODULE NAME: %s"% me32.szModule
print " executable = %s"% me32.szExePath
print " process ID = 0x%08X"% me32.th32ProcessID
print " ref count (g) = 0x%04X"% me32.GlblcntUsage
print " ref count (p) = 0x%04X"% me32.ProccntUsage
print " base address = 0x%08X"% me32.modBaseAddr
print " base size = %d"% me32.modBaseSize
ret = Module32Next( hModuleSnap , pointer(me32) )
CloseHandle( hModuleSnap )
return True
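As mentioned above, here is roughly what the TH32CS_SNAPPROCESS approach looks like for finding a process ID by executable name. This is only a sketch with minimal error handling; the structure layout follows the Tool Help documentation for the ANSI PROCESSENTRY32:
# Sketch: enumerate processes with TH32CS_SNAPPROCESS / PROCESSENTRY32 and
# return the PID whose executable name matches.
import ctypes
from ctypes import Structure, sizeof, byref, c_char, c_long, c_ulong, POINTER

TH32CS_SNAPPROCESS = 0x00000002

class PROCESSENTRY32(Structure):
    _fields_ = [('dwSize',              c_ulong),
                ('cntUsage',            c_ulong),
                ('th32ProcessID',       c_ulong),
                ('th32DefaultHeapID',   POINTER(c_ulong)),
                ('th32ModuleID',        c_ulong),
                ('cntThreads',          c_ulong),
                ('th32ParentProcessID', c_ulong),
                ('pcPriClassBase',      c_long),
                ('dwFlags',             c_ulong),
                ('szExeFile',           c_char * 260)]

def find_pid_by_name(exe_name):
    kernel32 = ctypes.windll.kernel32
    snapshot = kernel32.CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0)
    if snapshot == -1:  # INVALID_HANDLE_VALUE
        return None
    entry = PROCESSENTRY32()
    entry.dwSize = sizeof(PROCESSENTRY32)
    pid = None
    ok = kernel32.Process32First(snapshot, byref(entry))
    while ok:
        if entry.szExeFile.decode('mbcs', 'ignore').lower() == exe_name.lower():
            pid = entry.th32ProcessID
            break
        ok = kernel32.Process32Next(snapshot, byref(entry))
    kernel32.CloseHandle(snapshot)
    return pid

print(find_pid_by_name('RocketLeague.exe'))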

in python: child processes going defunct while others are not, unsure why

Edit: the answer was that the OS was killing processes because I was consuming all the memory.
I am spawning enough subprocesses to keep the load average 1:1 with cores. However, at some point within the hour (this script could run for days), 3 of the processes go:
tipu 14804 0.0 0.0 328776 428 pts/1 Sl 00:20 0:00 python run.py
tipu 14808 64.4 24.1 2163796 1848156 pts/1 Rl 00:20 44:41 python run.py
tipu 14809 8.2 0.0 0 0 pts/1 Z 00:20 5:43 [python] <defunct>
tipu 14810 60.3 24.3 2180308 1864664 pts/1 Rl 00:20 41:49 python run.py
tipu 14811 20.2 0.0 0 0 pts/1 Z 00:20 14:04 [python] <defunct>
tipu 14812 22.0 0.0 0 0 pts/1 Z 00:20 15:18 [python] <defunct>
tipu 15358 0.0 0.0 103292 872 pts/1 S+ 01:30 0:00 grep python
I have no idea why this is happening. Attached are the master and slave; I can attach the mysql/pg wrappers if needed as well. Any suggestions?
slave.py:
from boto.s3.key import Key
import multiprocessing
import gzip
import os
from mysql_wrapper import MySQLWrap
from pgsql_wrapper import PGSQLWrap
import boto
import re
class Slave:
CHUNKS = 250000
BUCKET_NAME = "bucket"
AWS_ACCESS_KEY = ""
AWS_ACCESS_SECRET = ""
KEY = Key(boto.connect_s3(AWS_ACCESS_KEY, AWS_ACCESS_SECRET).get_bucket(BUCKET_NAME))
S3_ROOT = "redshift_data_imports"
COLUMN_CACHE = {}
DEFAULT_COLUMN_VALUES = {}
def __init__(self, job_queue):
self.log_handler = open("logs/%s" % str(multiprocessing.current_process().name), "a");
self.mysql = MySQLWrap(self.log_handler)
self.pg = PGSQLWrap(self.log_handler)
self.job_queue = job_queue
def do_work(self):
self.log(str(os.getpid()))
while True:
#sample job in the abstract: mysql_db.table_with_date-iteration
job = self.job_queue.get()
#queue is empty
if job is None:
self.log_handler.close()
self.pg.close()
self.mysql.close()
print("good bye and good day from %d" % (os.getpid()))
self.job_queue.task_done()
break
#curtail iteration
table = job.split('-')[0]
#strip redshift table from job name
redshift_table = re.sub(r"(_[1-9].*)", "", table.split(".")[1])
iteration = int(job.split("-")[1])
offset = (iteration - 1) * self.CHUNKS
#columns redshift is expecting
#bad tables will slip through and error out, so we catch it
try:
colnames = self.COLUMN_CACHE[redshift_table]
except KeyError:
self.job_queue.task_done()
continue
#mysql fields to use in SELECT statement
fields = self.get_fields(table)
#list subtraction determining which columns redshift has that mysql does not
delta = (list(set(colnames) - set(fields.keys())))
#subtract columns that have a default value and so do not need padding
if delta:
delta = list(set(delta) - set(self.DEFAULT_COLUMN_VALUES[redshift_table]))
#concatenate columns with padded \N
select_fields = ",".join(fields.values()) + (",\\N" * len(delta))
query = "SELECT %s FROM %s LIMIT %d, %d" % (select_fields, table,
offset, self.CHUNKS)
rows = self.mysql.execute(query)
self.log("%s: %s\n" % (table, len(rows)))
if not rows:
self.job_queue.task_done()
continue
#if there is more data potentially, add it to the queue
if len(rows) == self.CHUNKS:
self.log("putting %s-%s" % (table, (iteration+1)))
self.job_queue.put("%s-%s" % (table, (iteration+1)))
#various characters need escaping
clean_rows = []
redshift_escape_chars = set( ["\\", "|", "\t", "\r", "\n"] )
in_chars = ""
for row in rows:
new_row = []
for value in row:
if value is not None:
in_chars = str(value)
else:
in_chars = ""
#escape any naughty characters
new_row.append("".join(["\\" + c if c in redshift_escape_chars else c for c in in_chars]))
new_row = "\t".join(new_row)
clean_rows.append(new_row)
rows = ",".join(fields.keys() + delta)
rows += "\n" + "\n".join(clean_rows)
offset = offset + self.CHUNKS
filename = "%s-%s.gz" % (table, iteration)
self.move_file_to_s3(filename, rows)
self.begin_data_import(job, redshift_table, ",".join(fields.keys() +
delta))
self.job_queue.task_done()
def move_file_to_s3(self, uri, contents):
tmp_file = "/dev/shm/%s" % str(os.getpid())
self.KEY.key = "%s/%s" % (self.S3_ROOT, uri)
self.log("key is %s" % self.KEY.key )
f = gzip.open(tmp_file, "wb")
f.write(contents)
f.close()
#local saving allows for debugging when copy commands fail
#text_file = open("tsv/%s" % uri, "w")
#text_file.write(contents)
#text_file.close()
self.KEY.set_contents_from_filename(tmp_file, replace=True)
def get_fields(self, table):
"""
Returns a dict used as:
{"column_name": "altered_column_name"}
Currently only the debug column gets altered
"""
exclude_fields = ["_qproc_id", "_mob_id", "_gw_id", "_batch_id", "Field"]
query = "show columns from %s" % (table)
fields = self.mysql.execute(query)
#key raw field, value mysql formatted field
new_fields = {}
#for field in fields:
for field in [val[0] for val in fields]:
if field in exclude_fields:
continue
old_field = field
if "debug_mode" == field.strip():
field = "IFNULL(debug_mode, 0)"
new_fields[old_field] = field
return new_fields
def log(self, text):
self.log_handler.write("\n%s" % text)
def begin_data_import(self, table, redshift_table, fields):
query = "copy %s (%s) from 's3://bucket/redshift_data_imports/%s' \
credentials 'aws_access_key_id=%s;aws_secret_access_key=%s' delimiter '\\t' \
gzip NULL AS '' COMPUPDATE ON ESCAPE IGNOREHEADER 1;" \
% (redshift_table, fields, table, self.AWS_ACCESS_KEY, self.AWS_ACCESS_SECRET)
self.pg.execute(query)
master.py:
from slave import Slave as Slave
import multiprocessing
from mysql_wrapper import MySQLWrap as MySQLWrap
from pgsql_wrapper import PGSQLWrap as PGSQLWrap
class Master:
SLAVE_COUNT = 5
def __init__(self):
self.mysql = MySQLWrap()
self.pg = PGSQLWrap()
def do_work(table):
pass
def get_table_listings(self):
"""Gathers a list of MySQL log tables needed to be imported"""
query = 'show databases'
result = self.mysql.execute(query)
#turns list[tuple] into a flat list
databases = list(sum(result, ()))
#overriding during development
databases = ['db1', 'db2', 'db3']
exclude = ('mysql', 'Database', 'information_schema')
scannable_tables = []
for database in databases:
if database in exclude:
continue
query = "show tables from %s" % database
result = self.mysql.execute(query)
#turns list[tuple] into a flat list
tables = list(sum(result, ()))
for table in tables:
exclude = ("Tables_in_%s" % database, "(", "201303", "detailed", "ltv")
#exclude any of the unfavorables
if any(s in table for s in exclude):
continue
scannable_tables.append("%s.%s-1" % (database, table))
return scannable_tables
def init(self):
#fetch redshift columns once and cache
#get columns from redshift so we can pad the mysql column delta with nulls
tables = ('table1', 'table2', 'table3')
for table in tables:
#cache columns
query = "SELECT column_name FROM information_schema.columns WHERE \
table_name = '%s'" % (table)
result = self.pg.execute(query, async=False, ret=True)
Slave.COLUMN_CACHE[table] = list(sum(result, ()))
#cache default values
query = "SELECT column_name FROM information_schema.columns WHERE \
table_name = '%s' and column_default is not \
null" % (table)
result = self.pg.execute(query, async=False, ret=True)
#turns list[tuple] into a flat list
result = list(sum(result, ()))
Slave.DEFAULT_COLUMN_VALUES[table] = result
def run(self):
self.init()
job_queue = multiprocessing.JoinableQueue()
tables = self.get_table_listings()
for table in tables:
job_queue.put(table)
processes = []
for i in range(Master.SLAVE_COUNT):
process = multiprocessing.Process(target=slave_runner, args=(job_queue,))
process.daemon = True
process.start()
processes.append(process)
#blocks this process until queue reaches 0
job_queue.join()
#signal each child process to GTFO
for i in range(Master.SLAVE_COUNT):
job_queue.put(None)
#blocks this process until queue reaches 0
job_queue.join()
job_queue.close()
#do not end this process until child processes close out
for process in processes:
process.join()
#toodles !
print("this is master saying goodbye")
def slave_runner(queue):
slave = Slave(queue)
slave.do_work()
There's not enough information to be sure, but the problem is very likely to be that Slave.do_work is raising an unhandled exception. (There are many lines of your code that could do that in various different conditions.)
When you do that, the child process will just exit.
On POSIX systems… well, the full details are a bit complicated, but in the simple case (what you have here), a child process that exits will stick around as a <defunct> process until it gets reaped (because the parent either waits on it, or exits). Since your parent code doesn't wait on the children until the queue is finished, that's exactly what happens.
So, there's a simple duct-tape fix:
def do_work(self):
self.log(str(os.getpid()))
while True:
try:
# the rest of your code
except Exception as e:
self.log("something appropriate {}".format(e))
# you may also want to post a reply back to the parent
You might also want to break the massive try up into different ones, so you can distinguish between all the different stages where things could go wrong (especially if some of them mean you need a reply, and some mean you don't).
However, it looks like what you're attempting to do is duplicate exactly the behavior of multiprocessing.Pool, but you've missed the mark in a couple of places. Which raises the question: why not just use Pool in the first place? You could then simplify/optimize things even further by using one of the map-family methods. For example, your entire Master.run could be reduced to:
self.init()
pool = multiprocessing.Pool(Master.SLAVE_COUNT, initializer=slave_setup)
pool.map(slave_job, tables)
pool.close()
pool.join()
And this will handle exceptions for you, and allow you to return values/exceptions if you later need that, and let you use the built-in logging library instead of trying to build your own, and so on. And it should only take about a dozen lines of minor code changes to Slave, and then you're done.
If you want to submit new jobs from within jobs, the easiest way to do this is probably with a Future-based API (which turns things around, making the future result the focus and the pool/executor the dumb thing that provides them, instead of making the pool the focus and the result the dumb thing it gives back), but there are multiple ways to do it with Pool as well. For example, right now, you're not returning anything from each job, so, you can just return a list of tables to execute. Here's a simple example that shows how to do it:
import multiprocessing
def foo(x):
print(x, x**2)
return list(range(x))
if __name__ == '__main__':
pool = multiprocessing.Pool(2)
jobs = [5]
while jobs:
jobs, oldjobs = [], jobs
for job in oldjobs:
jobs.extend(pool.apply(foo, [job]))
pool.close()
pool.join()
Obviously you can condense this a bit by replacing the whole loop with, e.g., a list comprehension fed into itertools.chain, and you can make it a lot cleaner-looking by passing "a submitter" object to each job and adding to that instead of returning a list of new jobs, and so on. But I wanted to make it as explicit as possible to show how little there is to it.
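For illustration only, the condensed form of that loop might look like this (same pool and foo as in the example above):
import itertools

jobs = [5]
while jobs:
    # one pass: run every job, then flatten the returned lists into the next batch
    jobs = list(itertools.chain.from_iterable(pool.apply(foo, [job]) for job in jobs))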
At any rate, if you think the explicit queue is easier to understand and manage, go for it. Just look at the source for multiprocessing.worker and/or concurrent.futures.ProcessPoolExecutor to see what you need to do yourself. It's not that hard, but there are enough things you could get wrong (personally, I always forget at least one edge case when I try to do something like this myself) that it's worth looking at code that gets it right.
Alternatively, it seems like the only reason you can't use concurrent.futures.ProcessPoolExecutor here is that you need to initialize some per-process state (the boto.s3.key.Key, MySqlWrap, etc.), for what are probably very good caching reasons. (If this involves a web-service query, a database connect, etc., you certainly don't want to do that once per task!) But there are a few different ways around that.
For example, you can subclass ProcessPoolExecutor and override the undocumented function _adjust_process_count (see the source for how simple it is) to pass your setup function, and… that's all you have to do.
Or you can mix and match. Wrap the Future from concurrent.futures around the AsyncResult from multiprocessing.
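Going back to the per-process state point, the Pool initializer route might look roughly like this. This is a sketch under assumptions: slave_setup/slave_job are hypothetical names, Slave.process is a hypothetical single-job method split out of do_work, and the module names match the files in the question:
import multiprocessing
from slave import Slave    # module names as in the question
from master import Master

_slave = None  # per-process state, built once per worker

def slave_setup():
    """Pool initializer: runs once in every worker process."""
    global _slave
    _slave = Slave(job_queue=None)   # connections/log handle created a single time

def slave_job(job):
    """Runs per task; 'process' is a hypothetical single-job method split out of do_work."""
    return _slave.process(job)

if __name__ == '__main__':
    master = Master()
    master.init()
    pool = multiprocessing.Pool(Master.SLAVE_COUNT, initializer=slave_setup)
    pool.map(slave_job, master.get_table_listings())
    pool.close()
    pool.join()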

Retrieve User Entry IDs from MAPI

I extended the win32comext MAPI with the interface IExchangeModifyTable to edit ACLs via MAPI. I can modify existing ACL entries, but I am stuck on adding new entries. I need the user's entry ID to add one, according to this C++ example
(example source from MSDN):
STDMETHODIMP AddUserPermission(
LPSTR szUserAlias,
LPMAPISESSION lpSession,
LPEXCHANGEMODIFYTABLE lpExchModTbl,
ACLRIGHTS frights)
{
HRESULT hr = S_OK;
LPADRBOOK lpAdrBook;
ULONG cbEid;
LPENTRYID lpEid = NULL;
SPropValue prop[2] = {0};
ROWLIST rowList = {0};
char szExName[MAX_PATH];
// Replace with "/o=OrganizationName/ou=SiteName/cn=Recipients/cn="
char* szServerDN = "/o=org/ou=site/cn=Recipients/cn=";
strcpy(szExName, szServerDN);
strcat(szExName, szUserAlias);
// Open the address book.
hr = lpSession->OpenAddressBook(0,
0,
MAPI_ACCESS_MODIFY,
&lpAdrBook );
if ( FAILED( hr ) ) goto cleanup;
// Obtain the entry ID for the recipient.
hr = HrCreateDirEntryIdEx(lpAdrBook,
szExName,
&cbEid,
&lpEid);
if ( FAILED( hr ) ) goto cleanup;
prop[0].ulPropTag = PR_MEMBER_ENTRYID;
prop[0].Value.bin.cb = cbEid;
prop[0].Value.bin.lpb = (BYTE*)lpEid;
prop[1].ulPropTag = PR_MEMBER_RIGHTS;
prop[1].Value.l = frights;
rowList.cEntries = 1;
rowList.aEntries->ulRowFlags = ROW_ADD;
rowList.aEntries->cValues = 2;
rowList.aEntries->rgPropVals = &prop[0];
hr = lpExchModTbl->ModifyTable(0, &rowList);
if(FAILED(hr)) goto cleanup;
printf("Added user permission. \n");
cleanup:
if (lpAdrBook)
lpAdrBook->Release();
return hr;
}
I can open the address book, but HrCreateDirEntryIdEx is not provided in the pywin32 mapi module. I found it in the exchange extension, which does not compile on my system (the missing-library problem). Do you have any idea how to retrieve the user's entry ID?
Thanks,
Patrick
I got this piece of code and it works fine:
from binascii import b2a_hex, a2b_hex
import active_directory as ad
# entry_type, see http://msdn.microsoft.com/en-us/library/cc840018.aspx
# + AB_DT_CONTAINER 0x000000100
# + AB_DT_TEMPLATE 0x000000101
# + AB_DT_OOUSER 0x000000102
# + AB_DT_SEARCH 0x000000200
# ab_flags, maybe see here: https://svn.openchange.org/openchange/trunk/libmapi/mapidefs.h
def gen_exchange_entry_id(user_id, ab_flags=0, entry_type = 0):
muidEMSAB = "DCA740C8C042101AB4B908002B2FE182"
version = 1
# Find user and bail out if it's not there
ad_obj = ad.find_user(user_id)
if not ad_obj:
return None
return "%08X%s%08X%08X%s00" % (
ab_flags,
muidEMSAB,
version,
entry_type,
b2a_hex(ad_obj.legacyExchangeDN.upper()).upper(),
)
data = gen_exchange_entry_id("myusername")
print data
print len(a2b_hex(data))
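To sanity-check the generated value, the string can be split back into its parts with plain slicing on the output of gen_exchange_entry_id above (a small sketch, nothing MAPI-specific):
# Decode the generated entry ID back into its components.
from binascii import a2b_hex

entry_id = gen_exchange_entry_id("myusername")
ab_flags    = entry_id[0:8]             # "%08X" ab_flags
muid        = entry_id[8:40]            # muidEMSAB provider UID
version     = entry_id[40:48]           # "%08X" version
entry_type  = entry_id[48:56]           # "%08X" entry_type
exchange_dn = a2b_hex(entry_id[56:-2])  # hex-encoded legacyExchangeDN; trailing "00" terminator
print(exchange_dn)                      # should be the upper-cased legacyExchangeDN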
