Not able to understand cursors in appengine - python

I'm trying to fetch results in a Python 2.7 App Engine app using cursors, but each time I use with_cursor() it fetches the same result set.
query = Model.all().filter("profile =", p_key).order('-created')
if r.get('cursor'):
    query = query.with_cursor(start_cursor=r.get('cursor'))
cursor = query.cursor()
objs = query.fetch(limit=10)
count = len(objs)
for obj in objs:
    ...
Each time through I'm getting the same 10 results. I'm thinking it has to do with using end_cursor, but how do I get that value if query.cursor() is returning the start_cursor? I've looked through the docs, but this is poorly documented.

Your formatting is a bit screwy, by the way. Looking at your code (which is incomplete and therefore potentially leaving something out), I have to assume you have forgotten to store the cursor after fetching results (or to return it to the user; I am assuming r is a request?).
So after you have fetched some data, you need to call cursor() on the query. For example, this function counts all entities using a cursor.
def count_entities(kind):
    c = None
    count = 0
    q = kind.all(keys_only=True)
    while True:
        if c:
            q.with_cursor(c)
        i = q.fetch(1000)
        count = count + len(i)
        if not i:
            break
        c = q.cursor()
    return count
See how, after fetch() has been called, c = q.cursor() grabs the cursor, and that value is used as the start cursor the next time through the loop.
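Usage would then look something like this (MyModel here is a hypothetical db.Model subclass, standing in for whichever kind you want to count):

# Hypothetical usage; MyModel is any db.Model subclass in your app.
total = count_entities(MyModel)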

Here's what finally worked:
query = Model.all().filter("profile =", p_key).order('-created')
if request.get('cursor'):
    query = query.with_cursor(request.get('cursor'))
objs = query.fetch(limit=10)
cursor = query.cursor()
for obj in objs:
    ...
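For completeness, here is a rough sketch of how the cursor might be carried between requests in a webapp handler. The handler name and URL parameter are hypothetical; the important part is that query.cursor() is only meaningful after fetch() has been called, and the returned string is handed back to the client for the next page.

from google.appengine.ext import webapp

class ProfileFeedHandler(webapp.RequestHandler):  # hypothetical handler name
    def get(self):
        query = Model.all().filter("profile =", p_key).order('-created')
        if self.request.get('cursor'):
            query = query.with_cursor(self.request.get('cursor'))
        objs = query.fetch(limit=10)
        next_cursor = query.cursor()  # call after fetch(), then hand back to the client
        # Render objs and emit a "next page" link such as ?cursor=<next_cursor>
        # so the next request picks up where this one left off.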

Related

Python: How can I append without overwriting a previous append in a for loop

I am currently trying to append the id of each query result to the output list in my code. I can get it to append one of the ids, but it overwrites the first one. How can I change my code so that any number of loop iterations reach output.append(q.id)?
Here is the code:
@app.route('/new-mealplan', methods=['POST'])
def create_mealplan():
    data = request.get_json()
    recipes = data['recipes']
    output = []
    for recipe in recipes:
        try:
            query = Recipes.query.filter(func.lower(Recipes.recipe_name) == func.lower(recipe)).all()
            # print(recipe)
            if query:
                query = Recipes.query.filter(func.lower(Recipes.recipe_name) == func.lower(recipe)).all()
                for q in query:
                    output.append(q.id)
        finally:
            return jsonify({"data": output})
To fix this, I removed the try and finally blocks, then returned only after the for-loop had completed.
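In other words, the corrected route looks roughly like this (same imports and models as above; the only changes are dropping try/finally and moving the return outside the loop):

@app.route('/new-mealplan', methods=['POST'])
def create_mealplan():
    data = request.get_json()
    recipes = data['recipes']
    output = []
    for recipe in recipes:
        query = Recipes.query.filter(func.lower(Recipes.recipe_name) == func.lower(recipe)).all()
        for q in query:
            output.append(q.id)
    # Return only after every recipe has been processed.
    return jsonify({"data": output})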

Python3.6 function not called

I'm developing a REST API in Python 3.6 using Flask-Rebar and PostgreSQL and am having trouble trying to execute some queries simultaneously using psycopg2.
More specifically, I execute a query and require the id value from this query for use in the next query. The first query successfully returns the expected value, however the function that calls the subsequent query doesn't even execute.
Here is the function responsible for calling the query function:
from time import strftime

psql = PostgresHandler()

user_ids = [1, 5, 9]
horse = {"name": "Adam", "age": 400}

def createHorseQuery(user_ids, horse):
    time_created = strftime("%Y-%m-%dT%H:%M:%SZ")
    fields, values = list(), list()
    for key, val in horse.items():
        fields.append(key)
        values.append(val)
    fields.append('time_created')
    values.append(time_created)
    fields = str(fields).replace('[', '(').replace(']', ')').replace("'", "")
    values = str(values).replace('[', '(').replace(']', ')')
    create_horse_query = f"INSERT INTO horse {fields} VALUES {values} RETURNING horse_id;"
    horse_id = None
    for h_id in psql.queryDatabase(create_horse_query, returnInsert=True):
        horse_id = h_id
    link_user_query = ''
    for u_id in user_ids:
        link_user_query += f"INSERT INTO user_to_horse (user_id, horse_id) VALUES ({u_id}, {horse_id['horse_id']});"
    psql.queryDatabase(link_user_query)
    return horse_id, 201
Here is the PostgresHandler() class that contains the function queryDatabase:
import psycopg2
from psycopg2.extras import RealDictCursor

class PostgresHandler(object):

    def __init__(self):
        self.connectToDatabase()

    def connectToDatabase(self):
        self.connection = psycopg2.connect(
            host='...',
            user='...',
            password='...',
            database='...'
        )

    def queryDatabase(self, query, returnInsert=False):
        cursor = self.connection.cursor(cursor_factory=RealDictCursor)
        cursor.execute(query)
        if "SELECT" in query.upper():
            for result in cursor.fetchall():
                yield result
        elif "INSERT" in query.upper():
            if returnInsert:
                for result in cursor.fetchall():
                    yield result
            self.connection.commit()
        cursor.close()
I can verify that the psql.queryDatabase(create_horse_query, returnInsert=True) operation is successful by querying the database manually and comparing against the return value, h_id.
I can verify that link_user_query is created and contains the user_ids and horse_id as expected by printing. I know the query that's generated is okay as I have tested this manually in the database.
It appears that the call on the line psql.queryDatabase(link_user_query) never actually executes, as a print statement at the very top of the queryDatabase function does not run.
I've tried with delays between the two query function calls, initialising a new connection with each function call and many other things to no avail and I am absolutely stumped. Any insight is greatly appreciated.
EDIT: FYI, the createHorseQuery function returns successfully and displays the two returned values as expected.
queryDatabase in your code is a generator because it contains a yield statement. The generator only actually does things when you iterate over it (i.e. cause its __next__() method to be called). Consider the following:
def gen():
    print("Gen is running!")
    yield "Gen yielded: hello"
    print("Gen did: commit")

print("***Doing stuff with b***")
b = gen()
for a in b:
    print(a)

print("***Doing stuff with c***")
c = gen()

print("***Done***")
Output is:
***Doing stuff with b***
Gen is running!
Gen yielded: hello
Gen did: commit
***Doing stuff with c***
***Done***
When we called gen() to create c we didn't actually run it, we just instantiated it as a generator.
We could force it to run by calling __next__() on it a bunch of times:
c.__next__()
try:
    c.__next__()
except StopIteration:
    print("Iteration is over!")
outputs:
Gen is running!
Gen did: commit
Iteration is over!
But really, you should probably not use a generator like this when you never intend to yield from it. You could consider adding a new function which is not a generator, called insertSilently (or similar).
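A minimal sketch of what that could look like as a method on PostgresHandler (insertSilently is just the name suggested above; it mirrors the existing cursor handling but is a plain function, so it runs as soon as it is called):

# Add to PostgresHandler: a plain method with no yield, so it executes immediately.
def insertSilently(self, query):
    cursor = self.connection.cursor(cursor_factory=RealDictCursor)
    cursor.execute(query)
    self.connection.commit()
    cursor.close()

The final call in createHorseQuery would then become psql.insertSilently(link_user_query).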

How to use bulk upsert in a loop?

The fields that I have in MongoDB are:
id, website_url, status.
I need to find the website_url and update its status to 3 and add a new field called err_desc.
I have a list of website_urls, along with their status and their err_desc values.
Below is my code.
from pymongo import MongoClient
from urlparse import urlparse

client = MongoClient('localhost', 9000)
db1 = client['Company_Website_Crawl']
collection1 = db1['All']
posts1 = collection1.posts
bulk = posts1.initialize_ordered_bulk_op()

website_url = ["http://www.example.com", "http://example2.com/"]
err_desc = ["error1", "error2"]

for i in website_url:
    parsed_uri = urlparse(i)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    final_url = domain
    final_url_strip = domain.rstrip("/")
    print i, final_url, final_url_strip, "\n"
    try:
        k = bulk.find({'website_url': i}).upsert().update({'$push': {'err_desc': err_desc, 'status': 3}})
        k = bulk.execute()
        print k
    except Exception as e:
        print "fail"
        print e
Error
fail batch op errors occurred
fail Bulk operations can only be executed once.
Initially I used
k = posts1.update({'website_url': final_url_strip}, {'$set': {'err_desc': err_desc, 'status': 3}}, multi=True)
It was too slow for 5M records, so I wanted to use the bulk update option. Kindly help me use bulk upsert for this scenario.
The error message is telling you that you need to re-initialize the bulk operation after calling execute(). But the thing is, you are doing it wrong: in your case, you need to call execute() once, at the end of the for loop, like this:
from itertools import count

ct = count(1)
val = 0
for url in website_url:
    ...
    try:
        bulk.find({'website_url': url}).upsert().update({'$push': {'err_desc': err_desc, 'status': 3}})
        val = next(ct)
    except Exception as e:
        ...
if val > 0:
    bulk.execute()
Also note that the Bulk() API is now deprecated and has been replaced by bulk_write().
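For reference, a rough equivalent with the newer API might look like the sketch below: build one UpdateOne per URL (here assuming err_desc is a list parallel to website_url) and execute the whole batch once with bulk_write().

from pymongo import MongoClient, UpdateOne

requests = []
for url, desc in zip(website_url, err_desc):
    requests.append(UpdateOne(
        {'website_url': url},
        {'$set': {'err_desc': desc, 'status': 3}},
        upsert=True))

if requests:
    result = posts1.bulk_write(requests)
    print result.bulk_api_result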

Control the value returned by a spawned process

In the code below, the worker function checks whether the data passed is valid and, if it is, returns a dictionary which will be used in a bulk SQLAlchemy Core insert. If it's invalid, I want the None value not to be added to receiving_list, because if it is, the bulk insert will fail, since a single None value cannot map onto the table structure.
from datetime import datetime
from sqlalchemy import Table
from sqlalchemy.exc import IntegrityError
import multiprocessing

CONN = Engine.connect()  # Engine is imported from another module
NUM_CONSUMERS = multiprocessing.cpu_count()
p = multiprocessing.Pool(NUM_CONSUMERS)

def process_data(data):
    # Long process to validate data
    if is_valid_data(data) == True:
        returned_dict = {}
        returned_dict['created_at'] = datetime.now()
        returned_dict['col1'] = data[0]
        returned_dict['colN'] = data[N]
        return returned_dict
    else:
        return None

def spawn_some_processes(data):
    table_to_insert = Table('postgresql_database_table', meta, autoload=True, autoload_with=Engine)
    while True:
        # Get some data here and pass it on to the worker
        receiving_list = p.map(process_data, data_to_process)
        try:
            if len(receiving_list) > 0:
                trans = CONN.begin()
                CONN.execute(table_to_insert.insert(), receiving_list)
                trans.commit()
        except IntegrityError:
            trans.rollback()
        except:
            trans.rollback()
Trying to rephrase the question, how can I stop a spawned process from adding to receiving_list when the value None is returned by the spawned process?
A workaround is incorporating a queue with queue.put() and queue.get() that will hold only valid data. The disadvantage with this is that after the processes are over, I have to unpack the queue, which adds overhead. My ideal solution would be one where a clean list of dictionaries is returned which SQLAlchemy can use to do the bulk insert.
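For what it's worth, the queue workaround described above might look roughly like this (a sketch only; it wraps the existing process_data and uses a Manager queue so it can be shared with the pool workers):

import multiprocessing
from functools import partial

def process_data_to_queue(queue, data):
    result = process_data(data)   # reuse the worker defined above
    if result is not None:        # only valid rows ever enter the queue
        queue.put(result)

manager = multiprocessing.Manager()
queue = manager.Queue()
p.map(partial(process_data_to_queue, queue), data_to_process)

# Drain the queue back into a plain list for the bulk insert.
receiving_list = []
while not queue.empty():
    receiving_list.append(queue.get())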
You can just remove the None entries from the list:
receiving_list = filter(None, p.map(process_data, data_to_process))
This is pretty quick even for really huge lists:
>>> timeit.timeit('l = filter(None, l)', 'l = range(0,10000000)', number=1)
0.47683095932006836
Note that using filter will remove anything where bool(val) is False, like empty strings, empty lists, etc. This should be fine for your use-case, though.
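If you ever do need to drop only None while keeping other falsy values, a list comprehension makes the intent explicit:

receiving_list = [d for d in p.map(process_data, data_to_process) if d is not None]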

Limit calls to external database with Python CGI

I've got a Python CGI script that pulls data from a GPS service; I'd like this information to be updated on the webpage about once every 10s (the max allowed by the GPS service's TOS). But there could be, say, 100 users viewing the webpage at once, all calling the script.
I think the users' scripts need to grab data from a buffer page that itself only updates once every ten seconds. How can I make this buffer page auto-update if no one is directly viewing the content (and not accessing the CGI)? Are there better ways to accomplish this?
Cache the results of your GPS data query in a file or database (sqlite) along with a datetime.
You can then do a datetime check against the last cached datetime to initiate another GPS data query.
You'll probably run into concurrency issues with CGI and the datetime check, though...
To get around concurrency issues, you can use sqlite, and put the write in a try/except.
Here's a sample cache implementation using sqlite.
import datetime
import sqlite3

class GpsCache(object):
    db_path = 'gps_cache.db'

    def __init__(self):
        self.con = sqlite3.connect(self.db_path)
        self.cur = self.con.cursor()

    def _get_period(self, dt=None):
        '''Normalize a datetime down to its 15-minute period.'''
        if dt.minute < 15:
            minute_period = 0
        elif 15 <= dt.minute < 30:
            minute_period = 15
        elif 30 <= dt.minute < 45:
            minute_period = 30
        else:  # 45 <= dt.minute
            minute_period = 45
        period_dt = datetime.datetime(year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, minute=minute_period)
        return period_dt

    def get_cache(self, dt=None):
        period_dt = self._get_period(dt)
        select_sql = 'SELECT * FROM GPS_CACHE WHERE date_time = "%s";' % period_dt.strftime('%Y-%m-%d %H:%M')
        self.cur.execute(select_sql)
        row = self.cur.fetchone()
        return row[0] if row else None

    def put_cache(self, dt=None, data=None):
        period_dt = self._get_period(dt)
        insert_sql = 'INSERT ....'  # edit to your table structure
        try:
            self.cur.execute(insert_sql)
            self.con.commit()
        except sqlite3.OperationalError:
            # assume the db is being updated by another process with the current results and ignore
            pass
So we have the cache tool; now the implementation side.
You'll want to check the cache first, and if it's not 'fresh' (doesn't return anything), go grab the data using your current method. Then cache the data you grabbed.
You should probably organize this better, but you should get the general idea here.
Using this sample, you just replace your current calls to remote_get_gps_data with get_gps_data.
import datetime

from gps_cacher import GpsCache

def remote_get_gps_data():
    # your function here
    return data

def get_gps_data():
    data = None
    gps_cache = GpsCache()
    current_dt = datetime.datetime.now()
    cached_data = gps_cache.get_cache(current_dt)
    if cached_data:
        data = cached_data
    else:
        data = remote_get_gps_data()
        gps_cache.put_cache(current_dt, data)
    return data
