Iterate a pymongo cursor from MLab - python

Why won't the cursor iterate? I feel sure there should be an easy solution.
I have tried multiple Stack Overflow answers and the documentation for MongoDB:
https://docs.mongodb.com/getting-started/python/query/
The code is below:
from pymongo import MongoClient
#Connect to Mongo Client
client = MongoClient('mongodb://the_username:the_password@ds047124.mlab.com:47124/politicians_from_theage')
db = client.politicians_from_theage #define database used
# Define Collection
collection = db.posts
print collection
Result:
Collection(Database(MongoClient(host=['ds047124.mlab.com:47124'], document_class=dict, tz_aware=False, connect=True), u'politicians_from_theage'), u'posts')
Then the cursor will print its location:
# Define Cursor
my_cursor = collection.find()
print my_cursor
Result:
<pymongo.cursor.Cursor object at 0x0000000003247518>
Trying to iterate over the cursor then produces a timeout:
# Perform query
cursor = db.posts.find()
# Iterate the cursor and print the documents.
for document in cursor:
    print(document)  # No luck
Traceback error on iteration:
Traceback (most recent call last):
  File "C:\PythonC\PythonWebScraping\17_MongoInterface\mongoget.py", line 18, in <module>
    for result_object in my_cursor:
  File "C:\Python27\lib\site-packages\pymongo\cursor.py", line 1090, in next
    if len(self.__data) or self._refresh():
  File "C:\Python27\lib\site-packages\pymongo\cursor.py", line 1012, in _refresh
    self.__read_concern))
  File "C:\Python27\lib\site-packages\pymongo\cursor.py", line 850, in __send_message
    **kwargs)
  File "C:\Python27\lib\site-packages\pymongo\mongo_client.py", line 827, in _send_message_with_response
    server = topology.select_server(selector)
  File "C:\Python27\lib\site-packages\pymongo\topology.py", line 210, in select_server
    address))
  File "C:\Python27\lib\site-packages\pymongo\topology.py", line 186, in select_servers
    self._error_message(selector))
pymongo.errors.ServerSelectionTimeoutError: ds047124.mlab.com:47124: timed out
I have tried iterating on 'cursor', 'my_cursor' and 'collection', each of which raises the same server-timeout traceback.
Any help/insight would be greatly appreciated

This may help you: the cursor can be materialized into a list (the Python equivalent of the Node driver's toArray()) and then iterated.
# Perform query and materialize the cursor into a list of documents
result = list(db.posts.find())
# Iterate the list and print the documents.
for document in result:
    print(document)
Let me know if it works.

Found the answer: I was focusing on the cursor itself rather than loading the documents out of the cursor and serializing them into a list of JSON strings.
Final code is below (with the URI anonymized):
import json
from datetime import date, timedelta
from pymongo import MongoClient
from bson import json_util
#Connect to Mongo Client
client = MongoClient('mongodb://user:pword@ds047124.mlab.com:47124/politicians_from_theage')
db = client.politicians_from_theage #define database used
print db
# Define Collection
collection = db.posts
print collection # print Collection(Database(MongoClient(host=['ds047124.mlab.com:47124']...
cursor = collection.find()
print cursor
# Obtain json
json_docs = []
for doc in cursor:
    json_doc = json.dumps(doc, default=json_util.default)
    json_docs.append(json_doc)
print json_docs #json result
# List Comprehension version
#json_docs = [json.dumps(doc, default=json_util.default) for doc in cursor]
# To get the documents back from the JSON strings as a list of dicts
docs = [json.loads(j_doc, object_hook=json_util.object_hook) for j_doc in json_docs]
print docs
print 'kitty terminates program'
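As a side note (my addition, not part of the original answer): bson.json_util also ships dumps() and loads() helpers that wrap json.dumps/json.loads with the same default and object_hook, so the round trip above can be written more compactly, assuming a fresh cursor from collection.find():
from bson import json_util
cursor = collection.find()
# json_util.dumps wraps json.dumps(..., default=json_util.default)
json_docs = [json_util.dumps(doc) for doc in cursor]
# json_util.loads wraps json.loads(..., object_hook=json_util.object_hook)
docs = [json_util.loads(j_doc) for j_doc in json_docs]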

Try this:
cursor = db.posts.find()
for document in list(cursor):
    print(document)
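Worth noting: the ServerSelectionTimeoutError in the question is raised while connecting, not while reading the cursor, so none of the iteration variants will help if the client cannot reach the server at all. A minimal connectivity check (a sketch, assuming the same mLab URI) could look like this:
from pymongo import MongoClient
from pymongo.errors import ServerSelectionTimeoutError

# Fail fast instead of waiting the default 30 seconds for server selection
client = MongoClient('mongodb://the_username:the_password@ds047124.mlab.com:47124/politicians_from_theage',
                     serverSelectionTimeoutMS=5000)
try:
    client.admin.command('ping')  # forces a round trip to the server
    print('connected')
except ServerSelectionTimeoutError as exc:
    print('cannot reach the server: %s' % exc)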

Related

SQLAlchemy bulk insert statement in Postgres database throws AttributeError

I am trying to bulk insert rows into a Postgres database with SQLAlchemy using an insert statement. I need to use the insert statement instead of bulk_insert_mappings, as I want to silently ignore failed insertions of duplicate entries. This was not apparent before, but I have added it now.
The table is created as it should. However, even a very simple insert operation via statement API throws this error:
AttributeError: '_NoResultMetaData' object has no attribute '_indexes_for_keys'
Minimal Verifiable Example:
import os
import sqlalchemy
from sqlalchemy import (
    Column,
    INTEGER,
    TEXT
)
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()

class Test(Base):
    __tablename__ = 'test'
    id = Column(INTEGER, primary_key=True)
    data = Column(TEXT)

engine = sqlalchemy.create_engine(os.environ['DATABASE_CONNECTION'])
Session = sessionmaker(engine)
Base.metadata.create_all(engine, Base.metadata.tables.values(), checkfirst=True)
connection = engine.connect()

buffer = [
    {
        'data': "First test"
    },
    {
        'data': "Second test"
    }
]

insert_statement = insert(Test).values(buffer)
# Using insert statement instead of bulk_insert_mappings so I can do nothing when adding duplicate entries
insert_or_do_nothing = insert_statement.on_conflict_do_nothing(index_elements=[Test.id])
orm_statement = sqlalchemy.select(Test).from_statement(insert_or_do_nothing)

with Session() as session:
    session.execute(orm_statement).scalars()

connection.close()
Full stacktrace:
Traceback (most recent call last):
  File "/project/path/test.py", line 41, in <module>
    session.execute(orm_statement).scalars()
  File "/venv/path/sqlalchemy/orm/session.py", line 1715, in execute
    result = compile_state_cls.orm_setup_cursor_result(
  File "/venv/path/sqlalchemy/orm/context.py", line 354, in orm_setup_cursor_result
    return loading.instances(result, querycontext)
  File "/venv/path/sqlalchemy/orm/loading.py", line 89, in instances
    cursor.close()
  File "/venv/path/sqlalchemy/util/langhelpers.py", line 70, in __exit__
    compat.raise_(
  File "/venv/path/sqlalchemy/util/compat.py", line 208, in raise_
    raise exception
  File "/venv/path/sqlalchemy/orm/loading.py", line 69, in instances
    *[
  File "/venv/path/sqlalchemy/orm/loading.py", line 70, in <listcomp>
    query_entity.row_processor(context, cursor)
  File "/venv/path/sqlalchemy/orm/context.py", line 2627, in row_processor
    _instance = loading._instance_processor(
  File "/venv/path/sqlalchemy/orm/loading.py", line 715, in _instance_processor
    primary_key_getter = result._tuple_getter(pk_cols)
  File "/venv/path/sqlalchemy/engine/result.py", line 934, in _tuple_getter
    return self._metadata._row_as_tuple_getter(keys)
  File "/venv/path/sqlalchemy/engine/result.py", line 106, in _row_as_tuple_getter
    indexes = self._indexes_for_keys(keys)
AttributeError: '_NoResultMetaData' object has no attribute '_indexes_for_keys'
Am I misusing the statement interface? The ORM statement looks fine:
INSERT INTO test (data) VALUES (:data_m0), (:data_m1)
I am using
PostgreSQL 14.4
psycopg2-binary 2.9.3
SQLAlchemy 1.4.39
Looking at the docs, you could try using session.bulk_insert_mappings():
buffer = [
    {
        'data': "First test"
    },
    {
        'data': "Second test"
    }
]

with Session() as session:
    session.bulk_insert_mappings(Test, buffer)
I found a solution that still uses the insert statement: avoid wrapping it in an ORM statement. For some reason, executing the plain statement does the job, whilst the ORM version throws the AttributeError.
This is confusing, as the official documentation calls for ORM statements:
# THIS APPROACH DID NOT WORK FOR ME
stmt = stmt.on_conflict_do_update(
    index_elements=[User.name], set_=dict(fullname=stmt.excluded.fullname)
).returning(User)
orm_stmt = (
    select(User)
    .from_statement(stmt)
    .execution_options(populate_existing=True)
)
for user in session.execute(
    orm_stmt,
).scalars():
    print("inserted or updated: %s" % user)
But if you omit the ORM statement part, all is good
# THIS WORKS
insert_statement = insert(Test).values(buffer)
insert_or_do_nothing = insert_statement.on_conflict_do_nothing(index_elements=[Test.id])

with Session() as session:
    session.execute(insert_or_do_nothing)
    session.commit()
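A possible follow-up (my own sketch, not from the original answer): if you do want the inserted rows back without going through the ORM from_statement() path, PostgreSQL's RETURNING can be read straight from the Core result. Rows skipped by ON CONFLICT DO NOTHING simply do not appear in it.
# Sketch: read back what was actually inserted, assuming the same Test model and buffer as above
insert_or_do_nothing = (
    insert(Test)
    .values(buffer)
    .on_conflict_do_nothing(index_elements=[Test.id])
    .returning(Test.id, Test.data)
)

with Session() as session:
    inserted = session.execute(insert_or_do_nothing).all()
    session.commit()

print(inserted)  # list of Row objects, e.g. [(1, 'First test'), (2, 'Second test')]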

How to execute a MySQL query with a Python script using the MySQLdb library?

I tried to modify an ETL and found that the old developer executes his commands directly on the connection (the ETL has been running for a few years). When I try to do the same myself I get an error (it seems the library expects me to do it from a cursor).
from etl.utils.logging import info
from etl.mysql.connect import db, db_name
from etl.mysql.operations import add_column_if_not_exists
from etl.utils.array import chunks
from pprint import pprint

def add_column_exclude_from_statistics():
    with db as c:
        # Create new columns where exclude_from_statistics
        info("Creating column exclude from statistics")
        c.execute("""
            UPDATE orders
            INNER JOIN contacts ON orders.id = contacts.`Contact ID`
            IF contacts.`Great Benefactor` = true OR orders.Campaign = `nuit-pour-la-mission`
            SET orders.exclude_from_statistics = 1
            ELSE
            SET orders.exclude_from_statistics = 0
            ;
        """)

def main():
    info("Table crm.orders")
    add_column_exclude_from_statistics()

if __name__ == '__main__':
    main()
But it returns that 'Connection' object has no attribute 'execute':
(venv) C:\Users\antoi\Documents\Programming\Work\data-tools>py -m etl.task.crm_orders_exclude_from_statistics
2021-06-25 17:12:44.357297 - Connecting to database hozana_data...
2021-06-25 17:12:44.365267 - Connecting to archive database hozana_archive...
2021-06-25 17:12:44.365267 - Table crm.orders
2021-06-25 17:12:44.365267 - Creating column exclude from statistics
Traceback (most recent call last):
  File "C:\Users\antoi\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\antoi\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\antoi\Documents\Programming\Work\data-tools\etl\task\crm_orders_exclude_from_statistics.py", line 28, in <module>
    main()
  File "C:\Users\antoi\Documents\Programming\Work\data-tools\etl\task\crm_orders_exclude_from_statistics.py", line 24, in main
    add_column_exclude_from_statistics()
  File "C:\Users\antoi\Documents\Programming\Work\data-tools\etl\task\crm_orders_exclude_from_statistics.py", line 12, in add_column_exclude_from_statistics
    c.execute("""
AttributeError: 'Connection' object has no attribute 'execute'
Here is what we have in etl.mysql.connect:
import os
import MySQLdb
from etl.utils.logging import info

db_host = os.environ['DB_HOST']
db_port = int(os.environ['DB_PORT'])
db_user = os.environ['DB_USER']
db_password = os.environ['DB_PASSWORD']
db_name = os.environ['DB_NAME']
db_name_archive = os.environ['DB_ARCHIVE_NAME']

info("Connecting to database {}...".format(db_name))
db = MySQLdb.connect(host=db_host,
                     port=db_port,
                     db=db_name,
                     user=db_user,
                     passwd=db_password)
It is strange to have done that, isn't it? Is it my MySQLdb library that is not up to date?
Here are the MySQL-related libraries. I did not find MySQLdb:
(venv) C:\Users\antoi\Documents\Programming\Work\data-tools>pip list |findstr mysql
mysql 0.0.3
mysql-connector-python 8.0.25
mysqlclient 2.0.3
According to the documentation, you first need to create a cursor once the connection is open: the 'Connection' object does not have an execute method, but the Cursor object does. Applying that to your code sample:
from etl.utils.logging import info
from etl.mysql.connect import db, db_name
from etl.mysql.operations import add_column_if_not_exists
from etl.utils.array import chunks
from pprint import pprint

def add_column_exclude_from_statistics():
    with db as c:
        # Create new columns where exclude_from_statistics
        info("Creating column exclude from statistics")
        cursor = c.cursor()  # Get the cursor
        cursor.execute("""
            UPDATE orders
            INNER JOIN contacts ON orders.id = contacts.`Contact ID`
            IF contacts.`Great Benefactor` = true OR orders.Campaign = `nuit-pour-la-mission`
            SET orders.exclude_from_statistics = 1
            ELSE
            SET orders.exclude_from_statistics = 0
            ;
        """)

def main():
    info("Table crm.orders")
    add_column_exclude_from_statistics()

if __name__ == '__main__':
    main()
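One more detail worth keeping in mind (my addition, not part of the answer above): MySQLdb does not autocommit by default, so an UPDATE executed through a cursor is only persisted after a commit. A minimal sketch with the same db connection, using a deliberately simplified statement for illustration:
cursor = db.cursor()  # statements go through a cursor, not the connection itself
cursor.execute("UPDATE orders SET exclude_from_statistics = 0")  # simplified statement for illustration
db.commit()  # persist the change; MySQLdb does not autocommit by default
cursor.close()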

Can't get MySQL results more than once

I'm trying so hard to get my socket-server Python script to loop every so often to check for updates in a MySQL table.
The code works the first time, no problem. On the second loop, and every loop after that, it throws errors.
Things I've tried:
try/catch (for multiple loops, to see if ANY work)
threading
conn.close()
cursor.close() (without committing any changes first, so this threw errors of course)
However, I can put the code in a standalone file and spam-run that file and it works perfectly.
It doesn't seem to like running the SQL code in the same process or file (which I thought threading would fix, but I guess I was wrong?).
Here is the error (note: the first line is the output I'm trying to print in a loop for testing):
(17, 'Default2', 1, 'uploads/2/projects/5e045c87109820.19290695.blend', '')
Exception in thread Thread-1:
Traceback (most recent call last):
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.7_3.7.1776.0_x64__qbz5n2kfra8p0\lib\threading.py", line 926, in _bootstrap_inner
    self.run()
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.7_3.7.1776.0_x64__qbz5n2kfra8p0\lib\threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "D:\xampp\htdocs\urender\serverfiles\test.py", line 7, in func
    fqueue = queuedb.checkQueue()
  File "D:\xampp\htdocs\urender\serverfiles\queuedb.py", line 7, in checkQueue
    cursor = conn.cursor()
  File "C:\Users\hackn\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.7_qbz5n2kfra8p0\LocalCache\local-packages\Python37\site-packages\mysql\connector\connection.py", line 806, in cursor
    self.handle_unread_result()
  File "C:\Users\hackn\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.7_qbz5n2kfra8p0\LocalCache\local-packages\Python37\site-packages\mysql\connector\connection.py", line 1059, in handle_unread_result
    raise errors.InternalError("Unread result found")
mysql.connector.errors.InternalError: Unread result found
[Finished in 6.727s]
Here is the basic code from the test.py:
import queuedb
from threading import Thread
import time

def func():
    time.sleep(5)
    fqueue = queuedb.checkQueue()
    return fqueue

func()
fqueue = queuedb.checkQueue()
print(fqueue)
Thread(target=func).start()
This is from my queuedb.py:
from dbconnect import dbconnect
import sys

def checkQueue():
    sql = "SELECT * FROM renderqueue WHERE renderer=''"
    conn = dbconnect.conn
    cursor = conn.cursor()
    cursor.execute(sql)
    result = cursor.fetchone()
    return result
And this is the dbconnect.py:
import mysql.connector
import sys
from xml.dom import minidom

def parseXML():
    try:
        xmlpath = "urender/serverfiles/dbvariables.xml"
        mydoc = minidom.parse(xmlpath)
        items = mydoc.getElementsByTagName('item')
        dbserver = items[0].attributes['dbserver'].value
        dbuser = items[1].attributes['dbuser'].value
        dbpass = items[2].attributes['dbpass'].value
        dbname = items[3].attributes['dbname'].value
        return dbserver, dbuser, dbpass, dbname
    except:
        print("Something went wrong with the XML DATA")
        sys.exit()

dbserver = parseXML()[0]
dbuser = parseXML()[1]
dbpass = parseXML()[2]
dbname = parseXML()[3]

class dbconnect:
    conn = mysql.connector.connect(host=dbserver, user=dbuser, passwd=dbpass, database=dbname)
I'm sorry for such a long post, but I hope I've explained the problem well enough and given an adequate amount of info.
hckm101,
As indicated by the exception, there are unread rows associated with your cursor.
To solve this, you have two options:
Use a buffered cursor, replacing your code with conn.cursor(buffered=True)
Or retrieve every result associated with your cursor, using a for loop with something like: for row in cursor: dosomething(row)
For more information, there is plenty of documentation available online (here).
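Applied to the checkQueue() function from the question, the buffered-cursor suggestion might look like the following sketch (untested against the original setup):
from dbconnect import dbconnect

def checkQueue():
    sql = "SELECT * FROM renderqueue WHERE renderer=''"
    conn = dbconnect.conn
    cursor = conn.cursor(buffered=True)  # fetches the full result set up front, so no unread rows are left behind
    cursor.execute(sql)
    result = cursor.fetchone()
    cursor.close()
    return result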

Python: MySQLdb cursorclass returns tuples instead of dictionaries when set as MySQLdb.cursors.DictCursor

So I am trying to retrieve data from a database with MySQLdb into a pandas DataFrame.
import MySQLdb as mysqldb
import MySQLdb.cursors
import pandas as pd

def connection():
    db = mysqldb.connect(
        host="123.456.7.890",
        user="user",
        passwd="password",
        db="database",
        port=12345,
        cursorclass=MySQLdb.cursors.DictCursor
    )
    return db

mysql = connection()

def testing():
    cur = mysql.cursor()
    query = cur.execute("select * from table1")
    result = cur.fetchall()
    cur.close()
    result_df = pd.DataFrame(result)
    return result_df
When I print the result of the 'testing' function, I get an error:
Traceback (most recent call last):
  File "C:/Users/xx/PycharmProjects/practice/python.py", line 97, in <module>
    print(testing())
  File "C:/Users/xx/PycharmProjects/practice/python.py", line 94, in testing
    result_df = pd.DataFrame(result)
  File "C:\Users\xx\PycharmProjects\practice\venv\lib\site-packages\pandas\core\frame.py", line 422, in __init__
    raise ValueError('DataFrame constructor not properly called!')
ValueError: DataFrame constructor not properly called!
I set cursorclass to MySQLdb.cursors.DictCursor to get the data in dictionary form, but it seems like I'm getting tuples instead. Currently using Python 3.7.
It works when I use pymysql, but that seems quite slow.
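One cheap sanity check (my own note, not an answer from the thread): the DataFrame constructor happily accepts a plain list of dict rows, so materializing the fetchall() result into a list and printing its first element shows exactly what the cursor class is handing back:
rows = list(result)   # result as returned by cur.fetchall()
print(type(rows[0]))  # dict if DictCursor took effect, tuple otherwise
result_df = pd.DataFrame(rows)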

Python multiprocessing - PicklingError: Can't pickle

The following class Downloader is supposed to query a SQL database multiple times and store the results in a list of pandas.DataFrame objects.
I would like to use multiprocessing to speed up the retrieval; however, I get the error:
  line 53, in run_queries
    dfs_queries = p.map(run_query, queries)
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 251, in map
    return self.map_async(func, iterable, chunksize).get()
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 567, in get
    raise self._value
PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed
I have looked into this question, which suggests that the pyodbc connection and cursor objects cannot be pickled. Is there a way to still use pool.map(f, arglist) from multiprocessing when f relies on a SQL connection?
import pyodbc
from multiprocessing import Pool as ThreadPool
import pandas as pd

class Downloader(object):
    def _connect(self, path_db_config):
        # ... Loads a config file from which it gets dsn, user and password ... #
        con_string = 'DSN=%s;UID=%s;PWD=%s;' % (dsn, user, password)
        return pyodbc.connect(con_string)

    def run_queries(self):
        queries = [# List of sql queries #]
        p = ThreadPool(len(queries))
        def run_query(query):
            cnxn = self._connect(PATH_DB_CONFIG)
            df = pd.read_sql(query, cnxn)
            return df
        return p.map(run_query, queries)
Thanks for the help!!
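One common way around this (a sketch of the usual pattern, not a drop-in fix for the class above): pool.map can only pickle functions defined at module level, and the connection never has to be pickled if each worker opens its own connection from a picklable connection string. The DSN and credentials below are placeholders.
import pandas as pd
import pyodbc
from multiprocessing import Pool

CON_STRING = 'DSN=my_dsn;UID=my_user;PWD=my_password;'  # placeholder credentials

def run_query(query):
    # Opened inside the worker process, so the connection object is never pickled
    cnxn = pyodbc.connect(CON_STRING)
    try:
        return pd.read_sql(query, cnxn)
    finally:
        cnxn.close()

def run_queries(queries):
    pool = Pool(len(queries))
    try:
        return pool.map(run_query, queries)  # only the query strings cross the process boundary
    finally:
        pool.close()
        pool.join()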
