This Python script does some data processing on a product list coming from an SQLite table. The commented-out for loop works as expected, but the multiprocessing loop does not work at all: I can see the processes being fired, but the script just halts. Any help?
import sqlite3 as lite
import sys
import pandas as pd
import datetime
from datetime import date
import time
from Levenshtein import *
import multiprocessing as mp
import copy
def getProducts():
    con = None
    try:
        con = lite.connect('pm.db', check_same_thread=False)
        con.row_factory = lite.Row
        cur = con.cursor()
        cur.execute("SELECT * FROM products")
        rows = cur.fetchall()
    except lite.Error, e:
        print "Error %s:" % e.args[0]
        sys.exit(1)
    finally:
        if con:
            con.close()
    return rows
def test_mp(row):
    print row

dictArray = []
counter = 0
rows = getProducts()

#for row in rows:
#    counter += 1
#    print 'product {count} from {max}'.format(count=counter, max=len(rows))
#    dictArray.extend(test_mp(row))

pool = mp.Pool(10)
for ret in pool.imap(test_mp, rows):
    print 'Done processing product'
    dictArray.extend(ret)
pool.terminate()
This is how I fixed it. Apparently the array of sqlite3.Row objects does not play well inside the pool.imap function, so I used another row factory to create a generic dict.
def dict_factory(cursor, row):
    d = {}
    for idx, col in enumerate(cursor.description):
        d[col[0]] = row[idx]
    return d
In getProducts:
con.row_factory = dict_factory
Credits go to: How can I get dict from sqlite query?
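For completeness, here is a minimal sketch of how the fixed pieces fit together (same pm.db / products table as above; the empty return value of test_mp is just a placeholder):

import sqlite3 as lite
import multiprocessing as mp

def dict_factory(cursor, row):
    # build a plain dict per row so it pickles cleanly for multiprocessing
    return {col[0]: row[idx] for idx, col in enumerate(cursor.description)}

def getProducts():
    con = lite.connect('pm.db', check_same_thread=False)
    con.row_factory = dict_factory   # instead of lite.Row
    rows = con.cursor().execute("SELECT * FROM products").fetchall()
    con.close()
    return rows

def test_mp(row):
    print(row)
    return []   # whatever list of per-product dicts you build

if __name__ == '__main__':
    dictArray = []
    pool = mp.Pool(10)
    for ret in pool.imap(test_mp, getProducts()):
        dictArray.extend(ret)
    pool.close()
    pool.join()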
I am trying to scrape data (the liking users of tweets) from Twitter and save it to an SQLite3 database. The Twitter API allows 75 requests at a time and then raises a "too many requests" error. I want to implement a mechanism so that, if there is an error, we wait for 15 minutes and then continue. The program should not start over but should continue sending requests from where the error was raised, after the 15 minutes of sleep time.
Any ideas please!
For getClient(), one should use their own credentials.
My code is:
import sqlite3
import pandas as pd
import json
import tweepy

def getClient():
    # BEARER_TOKEN, API_KEY, etc. are your own Twitter credentials
    client = tweepy.Client(bearer_token=BEARER_TOKEN,
                           consumer_key=API_KEY,
                           consumer_secret=API_KEY_SECRET,
                           access_token=ACCESS_TOKEN,
                           access_token_secret=ACCESS_TOKEN_SECRET)
    return client

def intersection(lst1, lst2):
    # returns the values of lst1 that are not yet in lst2
    lst3 = [value for value in lst1 if value not in lst2]
    return lst3

def addLikers(client):
    conn = sqlite3.connect("data.db")
    ids = pd.read_sql_query("SELECT tweetId FROM searchTweets", conn)['tweetId'].tolist()
    likes = pd.read_sql_query("SELECT tweetId FROM LikingUsers", conn)['tweetId'].tolist()
    communVals = intersection(ids, likes)
    c = conn.cursor()
    for communVal in communVals:
        # the function should be here
        likingUsers = client.get_liking_users(communVal)
        row = [(communVal), (json.dumps(likingUsers.data))]
        c.executemany("INSERT INTO searchFollowers VALUES (?,?)", (row,))
        conn.commit()

addLikers(getClient())
For now I am just committing the results until I get the error. To create the table in SQLite3:
import sqlite3

conn = sqlite3.connect("data.db")
c = conn.cursor()
c.execute("""CREATE TABLE LikingUsers (
    tweetId integer,
    reply text
)""")
You can use signal:
import signal

def signal_handler(signum, frame):
    raise Exception("Timed out!")

# limit the time before a specific function
signal.signal(signal.SIGALRM, signal_handler)
signal.alarm(60 * 15)  # 15 minutes
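Alternatively, a plain retry loop around the API call keeps the iteration exactly where it stopped: catch the rate-limit error, sleep 15 minutes, and retry the same communVal. A minimal sketch, assuming tweepy v4 (where the rate-limit error is raised as tweepy.TooManyRequests):

import time
import tweepy

def get_likers_with_retry(client, tweet_id):
    # keep retrying the same tweet until the rate-limit window resets
    while True:
        try:
            return client.get_liking_users(tweet_id)
        except tweepy.TooManyRequests:
            print("Rate limit hit, sleeping for 15 minutes...")
            time.sleep(15 * 60)

Inside addLikers(), replacing the direct call with likingUsers = get_likers_with_retry(client, communVal) lets the loop continue from the tweet where the error was raised. tweepy.Client also accepts a wait_on_rate_limit=True argument that handles the waiting internally, which may be the simplest option.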
The task is very simple: I need to find the number of rows that contain data, get those rows into a list, and use a loop to insert every element into the DB.
The problem is in the line data = sheet.range(f"A8:G{last_row}").value: if the range spans more than one row, it returns a list of lists, and the following loop iterates over each inner list and works perfectly. But if the range contains only one row, it returns just a flat list of values, so the loop iterates over those values and yields individual characters or something else, not the values of an inner list.
How can I make it a list of lists even when there is only one row in the source?
import xlwings as xw
from xlwings import Range, constants
import datetime
import pandas as pd
import psycopg2
cur_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.000')
wb = xw.books.open('CF.xlsm')
sheet = wb.sheets('Inserter')
last_row = sheet.range('A' + str(sheet.cells.last_cell.row)).end('up').row
data = sheet.range(f"A8:G{last_row}").value
print(type(data))
def check_none(x):
    x = '' if x is None else x
    return x
try:
    connection = psycopg2.connect(
        host='',
        database='',
        user='',
        password=''
    )
    connection.autocommit = True
    for i in data:
        id = i[0]
        level1 = check_none(i[1])
        level2 = check_none(i[2])
        level3 = check_none(i[3])
        level4 = check_none(i[4])
        amount = check_none(i[5])
        comment = check_none(i[6])
        with connection.cursor() as cursor:
            cursor.execute(f'''
                insert into xxxxxx.xxxxxxx (ID, level_1, level_2, level_3, level_4, date, date_eom, amount, comment)
                values (
                    '{id}',
                    '{level1}',
                    '{level2}',
                    '{level3}',
                    '{level4}',
                    '{cur_time}',
                    (date_trunc('month', CURRENT_DATE) + interval '1 month' - interval '1 day')::date,
                    {amount},
                    '{comment}')
            ''')
    print("[INFO] Data was successfully inserted")
except Exception as _ex:
    print("[INFO] Error while working with PostgreSQL", _ex)
finally:
    if connection:
        # cursor.close()
        connection.close()
        print("[INFO] PostgreSQL connection closed")
To always return a nested list, you can set the ndim=2 option:
data = sheet.range(f"A8:G{last_row}").options(ndim=2).value
See also the respective part in the docs: https://docs.xlwings.org/en/stable/datastructures.html#lists
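To illustrate the difference (a sketch; the cell contents are made up):

# without ndim: a single-row range collapses to a flat list
data = sheet.range("A8:G8").value                  # ['id1', 'a', 'b', ...]
# with ndim=2: always a list of rows, so the insert loop works unchanged
data = sheet.range("A8:G8").options(ndim=2).value  # [['id1', 'a', 'b', ...]]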
I am querying SQL Server for the list of fields both with threading and without threading.
import pyodbc
import datetime
import concurrent.futures
server = 'xx.xxx.xxx.xxx,1433'
database = 'db'
username = 'user'
password = 'password'
cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER='+server+';DATABASE='+database + ';UID='+username+';PWD='+password + ';MARS_Connection=yes' + ';Max_Pool_Size=100000')
filter_list = ["department_id", "employee_id", "city", "country", "state", "zip_cope", "department_name", "employee_name", "employee_experience"]
t1 = datetime.datetime.now()
result_list = []
def query_executor(field):
    try:
        ft1 = datetime.datetime.now()
        cursor = cnxn.cursor()
        result = cursor.execute("""SELECT DISTINCT TOP 1000 [{}] from EMPLOYEE_DETAILS""".format(field))
        print(field)
        result_list1 = [filter_item[0] for filter_item in result if filter_item[0]]
        # print("#############################################")
        return {"name": field, "filter_data": result_list1}
    except Exception as e:
        print(e)
    finally:
        print("#############################################")
        print(datetime.datetime.now() - ft1)
        print("#############################################")
        cursor.close()
# with threading
with concurrent.futures.ThreadPoolExecutor() as executor:
    result = [executor.submit(query_executor, field) for field in filter_list]
    for f in concurrent.futures.as_completed(result):
        result_list.append(f.result())
print(result_list)
t2 = datetime.datetime.now()
print("#############################################")
print('with threading time taken')
print(t2-t1)
print("#############################################")
# without threading
for f in filter_list:
    result_list.append(query_executor(f))
print(result_list)
t2 = datetime.datetime.now()
print("#############################################")
print('without threading time taken')
print(t2-t1)
print("#############################################")
While running, I comment out one or the other to see the time taken individually with and without threading, but I don't see much of a time difference. In fact, sometimes the threaded version is slower.
Am I doing something wrong? How can I get a performance boost? filter_list can grow even bigger, which may lead to slow responses.
Thanks in advance!
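One thing worth ruling out first: all of the threads above share the single module-level cnxn, so every query still goes through one ODBC connection and the work largely serializes there. A common approach is to give each worker its own connection; a rough sketch under that assumption (conn_str standing in for the same DRIVER/SERVER/UID/PWD string built above):

def query_executor(field):
    # one connection per call so the queries can actually overlap
    conn = pyodbc.connect(conn_str)
    try:
        cursor = conn.cursor()
        result = cursor.execute("SELECT DISTINCT TOP 1000 [{}] FROM EMPLOYEE_DETAILS".format(field))
        return {"name": field, "filter_data": [r[0] for r in result if r[0]]}
    finally:
        conn.close()

Whether this helps depends on the server and network; opening connections has its own cost, so a small pool of connections reused across tasks is another option.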
I am trying to fetch records at a regular interval from a database table that keeps growing. I am using Python and its pyodbc package to carry out the fetching. While fetching, how can I point the cursor to the row after the last row that was read, so that with every fetch I only get the newly inserted records?
To explain more:
my table has 100 records and they are fetched;
after an interval the table has 200 records and I want to fetch rows 101 to 200, and so on.
Is there a way to do this with the pyodbc cursor?
Or any other suggestion would be very helpful.
Below is the code I am trying:
#!/usr/bin/python
import pyodbc
import csv
import time
conn_str = (
    "DRIVER={PostgreSQL Unicode};"
    "DATABASE=postgres;"
    "UID=userid;"
    "PWD=database;"
    "SERVER=localhost;"
    "PORT=5432;"
)
conn = pyodbc.connect(conn_str)
cursor = conn.cursor()

def fetch_table(**kwargs):
    qry = kwargs['qrystr']
    try:
        #cursor = conn.cursor()
        cursor.execute(qry)
        all_rows = cursor.fetchall()
        rowcnt = cursor.rowcount
        rownum = cursor.description
        #return (rowcnt, rownum)
        return all_rows
    except pyodbc.ProgrammingError as e:
        print("Exception occurred as:", type(e), e)

def poll_db():
    for i in [1, 2]:
        stmt = "select * from my_database_table"
        rows = fetch_table(qrystr = stmt)
        print("***** For i = ", i, "******")
        for r in rows:
            print("ROW-> ", r)
        time.sleep(10)

poll_db()
conn.close()
I don't think you can use pyodbc, or any other odbc package, to find "new" rows. But if there is a 'timestamp' column in your database, or if you can add such a column (some databases allow for it to be automatically populated as the time of insertion so you don't have to change the insert queries) then you can change your query to select only the rows whose timestamp is greater than the previous timestamp. And you can keep changing the prev_timestamp variable on each iteration.
import datetime   # needed for datetime.datetime.now() below

def poll_db():
    prev_timestamp = ""
    for i in [1, 2]:
        if prev_timestamp == "":
            stmt = "select * from my_database_table"
        else:
            # convert your timestamp str to match the database's format
            stmt = "select * from my_database_table where timestamp > " + str(prev_timestamp)
        rows = fetch_table(qrystr = stmt)
        prev_timestamp = datetime.datetime.now()
        print("***** For i = ", i, "******")
        for r in rows:
            print("ROW-> ", r)
        time.sleep(10)
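To sidestep the timestamp formatting issue, the value can also be passed as a query parameter; pyodbc uses ? placeholders. A small sketch (bypassing fetch_table() for brevity, with prev_timestamp being the value saved on the previous pass):

stmt = "select * from my_database_table where timestamp > ?"
rows = cursor.execute(stmt, prev_timestamp).fetchall()
prev_timestamp = datetime.datetime.now()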
I'm using Python 2.7 and PostgreSQL 9.1.
Trying to get a dictionary from a query, I've tried the code as described here:
http://wiki.postgresql.org/wiki/Using_psycopg2_with_PostgreSQL
import psycopg2
import psycopg2.extras
conn = psycopg2.connect("dbname=mydb host=localhost user=user password=password")
cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
cur.execute ("select * from port")
type(cur.fetchall())
It prints the following:
<type 'list'>
Printing the item itself shows me that it is a list.
The expected result was a dictionary.
Edit:
Trying the following:
ans = cur.fetchall()[0]
print ans
print type(ans)
returns
[288, 'T', 51, 1, 1, '192.168.39.188']
<type 'list'>
Thanks a lot Andrey Shokhin, the full answer is:
#!/usr/bin/python
import psycopg2
import psycopg2.extras

conn = psycopg2.connect("dbname=uniart4_pr host=localhost user=user password=password")
cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
cur.execute("select * from port")
ans = cur.fetchall()
ans1 = []
for row in ans:
    ans1.append(dict(row))

print ans1  # actually it's returned
It's normal: when you call .fetchall(), the method returns a list of tuples. But if you write
type(cur.fetchone())
it will return only one row, with type:
<class 'psycopg2.extras.DictRow'>
After this you can use it as a list or like a dictionary:
cur.execute('SELECT id, msg FROM table;')
rec = cur.fetchone()
print rec[0], rec['msg']
You can also use a simple cursor iterator:
res = [json.dumps(dict(record)) for record in cursor] # it calls .fetchone() in loop
Perhaps to optimize it further we can have
#!/usr/bin/python
import psycopg2
import psycopg2.extras

def get_dict_resultset(sql):
    conn = psycopg2.connect("dbname=pem host=localhost user=postgres password=Drupal#1008")
    cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    cur.execute(sql)
    ans = cur.fetchall()
    dict_result = []
    for row in ans:
        dict_result.append(dict(row))
    return dict_result

sql = """select * from tablename"""
result = get_dict_resultset(sql)
If you don't want to use a psycopg2.extras.DictCursor you can create a list of dictionaries for the results using cursor.description:
# connect
connection = psycopg2.connect()
cursor = connection.cursor()

# query
cursor.execute("SELECT * FROM myTable")

# transform result
columns = list(cursor.description)
result = cursor.fetchall()

# make dict
results = []
for row in result:
    row_dict = {}
    for i, col in enumerate(columns):
        row_dict[col.name] = row[i]
    results.append(row_dict)

# display
print(results)
I use the following function fairly regularly:
import logging
import psycopg2

logger = logging.getLogger(__name__)

def select_query_dict(connection, query, data=[]):
    """
    Run generic select query on db, returns a list of dictionaries
    """
    logger.debug('Running query: {}'.format(query))
    # Open a cursor to perform database operations
    cursor = connection.cursor()
    logger.debug('Db connection successful')
    # execute the query
    try:
        logger.info('Running query.')
        if len(data):
            cursor.execute(query, data)
        else:
            cursor.execute(query)
        columns = list(cursor.description)
        result = cursor.fetchall()
        logger.debug('Query executed successfully')
    except (Exception, psycopg2.DatabaseError) as e:
        logger.error(e)
        cursor.close()
        exit(1)
    cursor.close()

    # make dict
    results = []
    for row in result:
        row_dict = {}
        for i, col in enumerate(columns):
            row_dict[col.name] = row[i]
        results.append(row_dict)

    return results
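Typical usage (a sketch; the connection string, table and parameter are placeholders):

conn = psycopg2.connect("dbname=mydb host=localhost user=user password=password")
rows = select_query_dict(conn, "SELECT * FROM port WHERE id = %s", [288])
for row in rows:
    print(row)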
In addition to returning just the query results, I would suggest returning them as key-value pairs (column-name: row-value). Here is my suggestion:
import psycopg2
import psycopg2.extras

def fetch_rows_as_dicts():   # wrapped in a function so the return below is valid
    conn = None
    try:
        conn = psycopg2.connect("dbname=uniart4_pr host=localhost user=user password=password")
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
            cursor.execute("SELECT * FROM table")
            column_names = [desc[0] for desc in cursor.description]
            res = cursor.fetchall()
            cursor.close()
            return map(lambda x: dict(zip(column_names, x)), res)
    except (Exception, psycopg2.DatabaseError) as e:
        logger.error(e)
    finally:
        if conn is not None:
            conn.close()
There is a built-in solution to get your result as a collection of dictionaries:
from psycopg2.extras import RealDictCursor
cur = conn.cursor(cursor_factory=RealDictCursor)
Modified from: https://www.peterbe.com/plog/from-postgres-to-json-strings, copyright 2013 Peter Bengtsson
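With that cursor factory, the fetched rows already behave like dictionaries keyed by column name, for example:

cur.execute("SELECT * FROM port")
for row in cur.fetchall():
    print(row)   # each row is a RealDictRow, i.e. dict-like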
For me, converting the row to a dictionary failed (with the solutions mentioned by others) and I also could not use a cursor factory.
I am using PostgreSQL 9.6.10. The code below worked for me, but I am not sure if it's the right way to do it.
def convert_to_dict(columns, results):
    """
    This method converts the result set from postgres to a dictionary:
    it iterates over the data, maps the columns to the values in the result set and converts them to a dictionary.
    :param columns: list - column names returned when the query is executed
    :param results: list / tuple - result set from when the query is executed
    :return: list of dictionaries - mapped with the table column names and their values
    """
    allResults = []
    columns = [col.name for col in columns]
    if type(results) is list:
        for value in results:
            allResults.append(dict(zip(columns, value)))
        return allResults
    elif type(results) is tuple:
        allResults.append(dict(zip(columns, results)))
        return allResults
Way to use it:
conn = psycopg2.connect("dbname=pem host=localhost user=postgres password=Drupal#1008")
cursor = conn.cursor()
cursor.execute("select * from tableNAme")

resultset = cursor.fetchall()
result = convert_to_dict(cursor.description, resultset)
print(result)

resultset = cursor.fetchone()
result = convert_to_dict(cursor.description, resultset)
print(result)
Contents of './config.py'
#!/usr/bin/python
PGCONF = {
    "user": "postgres",
    "password": "postgres",
    "host": "localhost",
    "database": "database_name"
}
Contents of './main.py'
#!/usr/bin/python
from config import PGCONF
import psycopg2
import psycopg2.extras
# open connection
conn = psycopg2.connect(**PGCONF)
cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
# declare lambda function
fetch_all_as_dict = lambda cursor: [dict(row) for row in cursor]
# execute any query of your choice
cur.execute("""select * from table_name limit 1""")
# get all rows as list of dicts
print(fetch_all_as_dict(cur))
# close cursor and connection
cur.close()
conn.close()