I need to limit the number of results returned for the query
My lambda function is
import json
import boto3
from boto3.dynamodb.conditions import Key, Attr
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('users')
def lambda_handler(event, context):
    """Return the items of a single ``table.scan`` call filtered on ``name == 'test'``.

    Args:
        event: Lambda invocation payload (not read here).
        context: Lambda runtime context (not read here).

    Returns:
        The ``Items`` list of the scan response.
    """
    name_is_test = Attr('name').eq('test')
    scan_result = table.scan(FilterExpression=name_is_test)
    return scan_result['Items']
How can I add a limit to the number of results it returns?
You can try with pagination.
import boto3
def lambda_handler(event, context):
    """Scan table ``foo`` page by page with the boto3 ``scan`` paginator.

    Args:
        event: Lambda invocation payload (not read here).
        context: Lambda runtime context (not read here).
    """
    client = boto3.client('dynamodb')
    paginator = client.get_paginator('scan')
    operation_parameters = {
        'TableName': 'foo',
        'FilterExpression': 'foo > :x',
        # Every placeholder used in FilterExpression has to be bound here, in
        # the low-level client's typed format. The value below is a sample;
        # replace it with the threshold you actually want to compare against.
        'ExpressionAttributeValues': {':x': {'N': '0'}},
    }
    page_iterator = paginator.paginate(**operation_parameters)
    for page in page_iterator:
        # do something with page['Items']
        pass
boto3 source source2
If you just want a limiting technique, take a look at this solution; note that it uses the query method, not scan.
def full_query(table, **kwargs):
    """Run ``table.query`` until the result set is exhausted and return all items.

    The same keyword arguments are sent with every request, so a ``Limit``
    passed here applies to each individual request rather than to the total
    number of items returned.

    Args:
        table: Object exposing a boto3-style ``query(**kwargs)`` method.
        **kwargs: Query parameters forwarded unchanged to every request.

    Returns:
        A list with the ``Items`` of every page, in the order received.
    """
    response = table.query(**kwargs)
    items = response['Items']
    while 'LastEvaluatedKey' in response:
        # Resume where the previous page stopped, keeping the caller's parameters.
        response = table.query(ExclusiveStartKey=response['LastEvaluatedKey'], **kwargs)
        items.extend(response['Items'])
    return items
# `table` is the boto3 DynamoDB Table to query; it is the required first argument.
full_query(table, Limit=37, KeyConditions={...})
Source : https://stackoverflow.com/a/56468281/9592801
Related
Since moto has no support for awswrangler, I am stuck and don't know how to mock it.
I am trying to unit-test my Lambda code, which runs an Athena query using awswrangler.
import awswrangler as wr
import boto3
def athena_query(dbtable, contact_id, athena_output, session):
    """Run the contact lookup through awswrangler and return the first result row.

    Args:
        dbtable: Value supplied for the ``dbtable`` query parameter.
        contact_id: Value supplied, wrapped in single quotes, for ``contactid``.
        athena_output: Passed to awswrangler as ``s3_output``.
        session: Passed to awswrangler as ``boto3_session``.

    Returns:
        Row ``0`` of the head of the frame returned by ``read_sql_query``.
    """
    sql_lines = [
        "",
        "SELECT",
        "*",
        "FROM",
        ":dbtable;",
        "WHERE",
        "contactid=:contactid;",
        "",
    ]
    query = "\n".join(sql_lines)
    substitutions = {
        "contactid": f"'{contact_id}'",
        "dbtable": f"{dbtable}",
    }
    frame = wr.athena.read_sql_query(
        query,
        params=substitutions,
        s3_output=athena_output,
        boto3_session=session,
    )
    return frame.head().loc[0]
response = athena_query("table_name", "123", "s3://bucket", boto3.session.Session())
I referenced the awswrangler GitHub issue, but when I try some of the tests provided in the link, they hit the real AWS service instead of running locally.
Here is an example implementation for this function using moto and pytest.
First I would correct your function according to awswrangler required parameters in its current version (2.16.1).
import awswrangler as wr
import boto3
def athena_query(database, dbtable, contact_id, athena_output, session):
    """Query Athena through awswrangler and hand back the first row of the result.

    Args:
        database: Passed positionally to ``read_sql_query`` after the SQL text.
        dbtable: Value supplied for the ``dbtable`` query parameter.
        contact_id: Value supplied, wrapped in single quotes, for ``contactid``.
        athena_output: Passed to awswrangler as ``s3_output``.
        session: Passed to awswrangler as ``boto3_session``.

    Returns:
        Row ``0`` of the head of the frame returned by ``read_sql_query``.
    """
    query = (
        "\n"
        "SELECT\n"
        "*\n"
        "FROM\n"
        ":dbtable;\n"
        "WHERE\n"
        "contactid=:contactid;\n"
    )
    query_params = {"contactid": f"'{contact_id}'", "dbtable": f"{dbtable}"}
    result_frame = wr.athena.read_sql_query(
        query,
        database,
        params=query_params,
        s3_output=athena_output,
        boto3_session=session,
    )
    first_rows = result_frame.head()
    return first_rows.loc[0]
Then, in a test/conftest.py file, I would declare the necessary mocked objects:
import os

import boto3
import moto
import pytest
# Names shared by the fixtures in this file and by the tests that import them.
TEST_BUCKET_NAME = "my_bucket"
REGION = "us-east-1"
DATABASE_NAME = "test_db"
TABLE_NAME = "test_table"
# DDL for an external, CSV-backed table (OpenCSVSerde) with columns a, b and
# contactid, whose data location is the input/ prefix of the test bucket.
TABLE_DDL = f"""CREATE EXTERNAL TABLE IF NOT EXISTS
{DATABASE_NAME}.{TABLE_NAME} (
a string,
b string,
contactid string
) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
'separatorChar' = ',',
'quoteChar' = '\"',
'escapeChar' = '\\'
)
STORED AS TEXTFILE
LOCATION 's3://{TEST_BUCKET_NAME}/input/';"""
@pytest.fixture
def aws_credentials():
    """Mocked AWS Credentials for moto.

    Sets placeholder values for the AWS credential environment variables so
    that clients created in the tests never pick up real credentials.
    """
    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
    os.environ["AWS_SECURITY_TOKEN"] = "testing"
    os.environ["AWS_SESSION_TOKEN"] = "testing"
@pytest.fixture
def s3_client(aws_credentials):
    """Yield a boto3 S3 client created inside moto's S3 mock.

    The mock stays active for as long as the requesting test runs, because
    the ``yield`` happens inside the ``with`` block.
    """
    with moto.mock_s3():
        conn = boto3.client("s3", region_name=REGION)
        yield conn
@pytest.fixture
def athena_client(aws_credentials):
    """Yield a boto3 Athena client created inside moto's Athena mock."""
    # Use the top-level moto.mock_athena, matching moto.mock_s3 in s3_client;
    # `import moto` alone does not guarantee that the moto.athena submodule
    # attribute is available.
    with moto.mock_athena():
        conn = boto3.client("athena", region_name=REGION)
        yield conn
@pytest.fixture
def s3_bucket(s3_client):
    """Create the test bucket in the mocked S3 and yield it as a Bucket resource."""
    # NOTE(review): the bucket is created in eu-west-1 while the clients in
    # this file use REGION - confirm the mismatch is intended.
    s3_client.create_bucket(
        Bucket=TEST_BUCKET_NAME,
        CreateBucketConfiguration={
            'LocationConstraint': 'eu-west-1'
        }
    )
    yield boto3.resource('s3').Bucket(TEST_BUCKET_NAME)
@pytest.fixture
def athena_table(athena_client, s3_bucket):
    """Submit the CREATE DATABASE and CREATE TABLE statements to the mocked Athena."""
    # The f prefix is required; without it the literal text
    # "{TEST_BUCKET_NAME}" would be sent as the output location.
    output_location = f"s3://{TEST_BUCKET_NAME}/queries/"
    # create database, then create table
    for statement in (f"create database {DATABASE_NAME}", TABLE_DDL):
        athena_client.start_query_execution(
            QueryString=statement,
            ResultConfiguration={"OutputLocation": output_location},
        )
And, then I would define a test of the function in a separate test/athena_test.py file. This is using mocker to mock awswrangler response to the query but you could use advanced testing using the mock objects created in the conftest.py file :
from conftest import TEST_BUCKET_NAME, DATABASE_NAME, TABLE_NAME
# import your function to test here
def test_athena_query(s3_bucket, athena_table, mocker):
    """athena_query returns the first row of the frame awswrangler hands back."""
    import boto3
    import pandas as pd

    fake_frame = pd.DataFrame.from_dict(
        {"a": [1, 2], "b": [3, 4], "contactid": [123, 123]}
    )

    # mocking: the patch target must be an importable dotted path. "wr" is
    # only an alias inside the module under test, so patch the attribute on
    # the real awswrangler.athena module instead.
    mock_wr_call = mocker.patch(
        "awswrangler.athena.read_sql_query", return_value=fake_frame
    )

    response = athena_query(
        DATABASE_NAME,
        TABLE_NAME,
        "123",
        f"s3://{TEST_BUCKET_NAME}/queries/",
        boto3.session.Session(),
    )

    mock_wr_call.assert_called_once()
    # athena_query returns a single row (a Series), so check its values
    # rather than a row count.
    assert response["a"] == 1
    assert response["b"] == 3
    assert response["contactid"] == 123
Resources:
https://aws-data-wrangler.readthedocs.io/en/stable/stubs/awswrangler.athena.read_sql_query.html
I am creating a SAM web app, with the backend being an API in front of a Python Lambda function with a DynamoDB table that maintains a count of the number of HTTP calls to the API. The API must also return this number. The yaml code itself loads normally. My problem is writing the Lambda function to iterate and return the count. Here is my code:
def lambda_handler(event, context):
    """Increment the visitor counter item and return the new value to the API.

    Args:
        event: Lambda invocation payload (not read here).
        context: Lambda runtime context (not read here).

    Returns:
        An API response dict whose JSON body holds the updated ``VisitorCount``.
    """
    dynamodb = boto3.resource("dynamodb")
    ddbTableName = os.environ["databaseName"]
    table = dynamodb.Table(ddbTableName)

    # Update item in table or add if doesn't exist. ADD creates the attribute
    # when it is missing, whereas "SET count = count + :value" fails on the
    # first call. "count" is a DynamoDB reserved word, so it is referenced
    # through the #count placeholder.
    ddbResponse = table.update_item(
        Key={"id": "VisitorCount"},
        UpdateExpression="ADD #count :value",
        ExpressionAttributeNames={"#count": "count"},
        ExpressionAttributeValues={":value": 1},
        ReturnValues="UPDATED_NEW",
    )

    # Format dynamodb response into variable. The resource API returns numbers
    # as Decimal, which json.dumps cannot serialise, hence the int() conversion.
    responseBody = json.dumps({"VisitorCount": int(ddbResponse["Attributes"]["count"])})

    # Create api response object
    apiResponse = {"isBase64Encoded": False, "statusCode": 200, "body": responseBody}

    # Return api response object
    return apiResponse
I can get VisitorCount to be a string, but not a number. I get this error (the traceback is truncated in the log): [ERROR] TypeError: lambda_handler() missing 1 required positional argument: 'cou… — raised from `response = request_handler(event, lambda_context)`.
What is going on?
[UPDATE] I found the original error, which was that the function was not properly received by the SAM app. Changing the name fixed this, and it is now being read. Now I have to troubleshoot the actual Python. New Code:
import json
import boto3
import os
dynamodb = boto3.resource("dynamodb")
ddbTableName = os.environ["databaseName"]
table = dynamodb.Table(ddbTableName)
# Primary key of the counter item, in the plain-Python form the boto3 resource
# API expects (no {"N": ...} type wrappers).
# NOTE(review): assumes the table's partition key is "id", as in the earlier
# version of this handler - confirm against the table schema.
Key = {"id": "VisitorCount"}


def handler(event, context):
    """Increment ``VisitorCount`` on the counter item and return the new value.

    Args:
        event: Lambda invocation payload (not read here).
        context: Lambda runtime context (not read here).

    Returns:
        An API response dict whose JSON body holds the updated ``VisitorCount``.
    """
    # Update item in table or add if doesn't exist. update_item requires Key;
    # ADD creates the attribute when it does not exist yet.
    ddbResponse = table.update_item(
        Key=Key,
        UpdateExpression="ADD VisitorCount :val",
        ExpressionAttributeValues={":val": 1},
        ReturnValues="UPDATED_NEW",
    )

    # Format dynamodb response into variable. The attribute updated above is
    # "VisitorCount"; it comes back as a Decimal, which json.dumps cannot
    # serialise, hence the int() conversion.
    responseBody = json.dumps({"VisitorCount": int(ddbResponse["Attributes"]["VisitorCount"])})

    # Create api response object
    apiResponse = {"isBase64Encoded": False, "statusCode": 200, "body": responseBody}

    # Return api response object
    return apiResponse
I am getting a syntax error on Line 13, which is
UpdateExpression= "set VisitorCount = VisitorCount + :val",
But I can't tell where I am going wrong on this. It should update the DynamoDB table to increase the count by 1. Looking at the AWS guide it appears to be the correct syntax.
I'm not sure what the exact error is, but the update_item call should look like this:
ddbResponse = table.update_item(
Key={
'key1': aaa,
'key2': bbb
},
UpdateExpression= "set VisitorCount = VisitorCount + :val",
ExpressionAttributeValues={":val": Decimal(1)},
ReturnValues="UPDATED_NEW",
)
Specify item to be updated with Key (one item for one Lambda call)
Set Decimal(1) for ExpressionAttributeValues
https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/GettingStarted.Python.03.html#GettingStarted.Python.03.04
I am creating a Lambda function that lists items from DynamoDB, and I am trying to pass parameters to filter the data. Without a filter I can get the data with the scan method, but when I pass the filter I get an error. Below is the code I am trying:
from __future__ import print_function
import json
import boto3
from boto3.dynamodb.conditions import Key, Attr
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('table-all')
def scan_table_allpages(self, table_name, filter_key=None, filter_value=None):
    """Scan every page of a DynamoDB table, optionally filtered on one attribute.

    Args:
        self: Object exposing the boto3 resource as ``dynamodb_resource``.
        table_name: Name of the table to scan.
        filter_key: Attribute name to filter on; used only together with
            ``filter_value``.
        filter_value: Value the attribute must equal.

    Returns:
        A list with the ``Items`` of every page, in the order received.
    """
    # Look the table up by the name that was passed in; the original read the
    # local `table` before it was assigned.
    table = self.dynamodb_resource.Table(table_name)

    scan_kwargs = {}
    if filter_key and filter_value:
        # Filter expressions are built from Attr conditions; Key is meant for
        # key conditions of a query.
        scan_kwargs['FilterExpression'] = Attr(filter_key).eq(filter_value)

    response = table.scan(**scan_kwargs)
    items = response['Items']
    while True:
        print(len(response['Items']))
        last_key = response.get('LastEvaluatedKey')
        if not last_key:
            break
        # Re-send the filter on every page; otherwise follow-up pages come
        # back unfiltered.
        response = table.scan(ExclusiveStartKey=last_key, **scan_kwargs)
        items += response['Items']
    return items
# def lambda_handler(event, context):
# print(table.creation_date_time)
# response = table.get_items(
# Key={
# 'Country':event['pathParameters']['USA']
# }
# )
# #response = table.query()
# #print(response)
# return response
scan_table_allpages(self.table,filter_key="Country",filter_value='USA')
I am trying to get a list of the table names in Athena using boto3 in Python.
This is my code; I think my attempt at using a paginator is not correct. Any help is appreciated.
import boto3
# Print the tables of every Glue database whose name contains "dbName_".
client = boto3.client('glue')

# get_paginator takes the operation name; calling paginate() on the result
# yields the response pages.
database_pages = client.get_paginator('get_databases').paginate()
table_paginator = client.get_paginator('get_tables')

for database_page in database_pages:
    for databaseDict in database_page['DatabaseList']:
        databaseName = databaseDict['Name']
        if "dbName_" not in databaseName:
            continue
        print('\ndatabaseName: ' + databaseName)
        for table_page in table_paginator.paginate(DatabaseName=databaseName):
            for table in table_page['TableList']:
                print(table['Name'])
The get_paginator function parameter must be the name of the operation. It looks like you're trying to paginate on the get_tables function so
paginator = client.get_paginator(['TableList'])
should be:
paginator = client.get_paginator('get_tables')
Once you have the paginator object, you need to call paginator.paginate to retrieve the iterator. You can send your database parameters like so:
page_iterator = paginator.paginate(
DatabaseName=databaseDict['Name'],
PaginationConfig={
'MaxItems': 123,
'PageSize': 123,
'StartingToken': 'string'
}
)
See the documentation for this function here.
Now that you have the iterator, you can call a for loop by enumerating on it:
for page_index, page in enumerate(page_iterator):
Here's a full working example on how to do it using paginator.
Remember to provide region_name and database_name.
import boto3
# Fill in the two placeholders below before running.
region_name = '<PROVIDE_AWS_REGION_NAME>'
database_name = '<PROVIDE_YOUR_DATABASE_NAME>'
catalog_name = 'AwsDataCatalog'

athena = boto3.client('athena', region_name=region_name)
paginator = athena.get_paginator('list_table_metadata')
response_iterator = paginator.paginate(CatalogName=catalog_name, DatabaseName=database_name)

# Collect the Name of every table metadata entry across all pages.
table_names = [
    metadata['Name']
    for page in response_iterator
    for metadata in page['TableMetadataList']
]
print(table_names)
My table is around 220mb with 250k records within it. I'm trying to pull all of this data into python. I realize this needs to be a chunked batch process and looped through, but I'm not sure how I can set the batches to start where the previous left off.
Is there some way to filter my scan? From what I read that filtering occurs after loading and the loading stops at 1mb so I wouldn't actually be able to scan in new objects.
Any assistance would be appreciated.
import boto3
dynamodb = boto3.resource('dynamodb',
aws_session_token = aws_session_token,
aws_access_key_id = aws_access_key_id,
aws_secret_access_key = aws_secret_access_key,
region_name = region
)
table = dynamodb.Table('widgetsTableName')
data = table.scan()
I think the Amazon DynamoDB documentation regarding table scanning answers your question.
In short, you'll need to check for LastEvaluatedKey in the response. Here is an example using your code:
import boto3
dynamodb = boto3.resource('dynamodb',
aws_session_token=aws_session_token,
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
region_name=region
)
table = dynamodb.Table('widgetsTableName')
response = table.scan()
data = response['Items']
while 'LastEvaluatedKey' in response:
response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
data.extend(response['Items'])
DynamoDB limits the scan method to 1mb of data per scan.
Documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb.html#DynamoDB.Client.scan
Here is an example loop to get all the data from a DynamoDB table using LastEvaluatedKey:
import boto3
client = boto3.client('dynamodb')
def dump_table(table_name):
    """Return every item of ``table_name`` by scanning until no page key remains.

    Args:
        table_name: Name of the DynamoDB table passed to ``client.scan``.

    Returns:
        A list with the ``Items`` of every scan response, in the order received.
    """
    scan_kwargs = {'TableName': table_name}
    collected = []
    while True:
        page = client.scan(**scan_kwargs)
        collected.extend(page['Items'])
        next_key = page.get('LastEvaluatedKey')
        if not next_key:
            return collected
        # Continue the next scan from where this page stopped.
        scan_kwargs['ExclusiveStartKey'] = next_key
# Usage
data = dump_table('your-table-name')
# do something with data
boto3 offers paginators that handle all the pagination details for you. Here is the doc page for the scan paginator. Basically, you would use it like so:
import boto3
client = boto3.client('dynamodb')
paginator = client.get_paginator('scan')
# Scan requires a TableName; replace the placeholder with your table.
for page in paginator.paginate(TableName='your-table-name'):
    # do something with page['Items']
    pass
Riffing off of Jordon Phillips's answer, here's how you'd pass a FilterExpression in with the pagination:
import boto3
client = boto3.client('dynamodb')
paginator = client.get_paginator('scan')
# The low-level client takes typed attribute values ({'S': ...}); each
# placeholder used in FilterExpression is bound in ExpressionAttributeValues.
operation_parameters = {
    'TableName': 'foo',
    'FilterExpression': 'bar > :x AND bar < :y',
    'ExpressionAttributeValues': {
        ':x': {'S': '2017-01-31T01:35'},
        ':y': {'S': '2017-01-31T02:08'},
    },
}
page_iterator = paginator.paginate(**operation_parameters)
for page in page_iterator:
    # do something with page['Items']
    pass
Code for stripping the DynamoDB type descriptors from the results, as @kungphu mentioned.
import boto3
from boto3.dynamodb.types import TypeDeserializer
from boto3.dynamodb.transform import TransformationInjector
client = boto3.client('dynamodb')
paginator = client.get_paginator('query')
service_model = client._service_model.operation_model('Query')
trans = TransformationInjector(deserializer = TypeDeserializer())
for page in paginator.paginate():
trans.inject_attribute_value_output(page, service_model)
Turns out that Boto3 captures the "LastEvaluatedKey" as part of the returned response. This can be used as the start point for a scan:
data= table.scan(
ExclusiveStartKey=data['LastEvaluatedKey']
)
I plan on building a loop around this that keeps scanning until the response no longer contains a LastEvaluatedKey.
The 2 approaches suggested above both have problems: Either writing lengthy and repetitive code that handles paging explicitly in a loop, or using Boto paginators with low-level sessions, and foregoing the advantages of higher-level Boto objects.
A solution using Python functional code to provide a high-level abstraction allows higher-level Boto methods to be used, while hiding the complexity of AWS paging:
import itertools
import typing
def iterate_result_pages(function_returning_response: typing.Callable, *args, **kwargs) -> typing.Generator:
    """Yield the 'Items' list of each response from a paged AWS call.

    Args:
        function_returning_response: Callable (for example a bound method) that
            returns a response containing 'Items' and, while more pages remain,
            'LastEvaluatedKey'.
        *args: Positional arguments forwarded to every call.
        **kwargs: Keyword arguments forwarded to every call; 'ExclusiveStartKey'
            is added for each follow-up call.

    Returns:
        A generator yielding one 'Items' sequence per response.
    """
    while True:
        page = function_returning_response(*args, **kwargs)
        yield page["Items"]
        if "LastEvaluatedKey" not in page:
            break
        # Ask the next call to continue after the last key of this page.
        kwargs["ExclusiveStartKey"] = page["LastEvaluatedKey"]
def iterate_paged_results(function_returning_response: typing.Callable, *args, **kwargs) -> typing.Iterator:
    """Iterate over the individual items of every page of a paged AWS call.

    Pages are fetched through ``iterate_result_pages`` and flattened, so each
    item is handed to the caller as soon as its page has been received.

    Args:
        function_returning_response: Callable (for example a bound method) that
            returns a response containing 'Items' and optionally 'LastEvaluatedKey'.
        *args: Positional arguments forwarded to the callable.
        **kwargs: Keyword arguments forwarded to the callable.

    Returns:
        An iterator yielding one response item at a time.
    """
    page_stream = iterate_result_pages(function_returning_response, *args, **kwargs)
    return itertools.chain.from_iterable(page_stream)
# Example, assuming 'table' is a Boto DynamoDB table object. The paged method
# (here table.scan) is the required first argument:
all_items = list(iterate_paged_results(table.scan, ProjectionExpression='my_field'))
I had some problems with Vincent's answer related to the transformation being applied to the LastEvaluatedKey and messing up the pagination. Solved as follows:
import boto3
from boto3.dynamodb.types import TypeDeserializer
from boto3.dynamodb.transform import TransformationInjector
# Scan a table with the low-level client's paginator and convert every page
# from DynamoDB's typed format with TransformationInjector.
client = boto3.client('dynamodb')
paginator = client.get_paginator('scan')
operation_model = client._service_model.operation_model('Scan')
trans = TransformationInjector(deserializer = TypeDeserializer())
operation_parameters = {
    'TableName': 'tablename',
}
items = []
for page in paginator.paginate(**operation_parameters):
    # Keep a copy of the untransformed LastEvaluatedKey: the transformation
    # below rewrites it as well, which breaks the pagination.
    has_last_key = 'LastEvaluatedKey' in page
    if has_last_key:
        last_key = page['LastEvaluatedKey'].copy()
    trans.inject_attribute_value_output(page, operation_model)
    # Put the original key back before the paginator reads it.
    if has_last_key:
        page['LastEvaluatedKey'] = last_key
    items.extend(page['Items'])
If you are landing here looking for a paginated scan with some filtering expression(s):
def scan(table, **kwargs):
    """Yield every item from ``table.scan``, following ``LastEvaluatedKey`` pages.

    Args:
        table: Object exposing a boto3-style ``scan(**kwargs)`` method.
        **kwargs: Scan parameters (for example a FilterExpression) sent with
            every request.

    Yields:
        The items of each response, one at a time.
    """
    page = table.scan(**kwargs)
    while True:
        yield from page['Items']
        start_key = page.get('LastEvaluatedKey')
        if not start_key:
            return
        page = table.scan(ExclusiveStartKey=start_key, **kwargs)
Example usage:
table = boto3.Session(...).resource('dynamodb').Table('widgetsTableName')
items = list(scan(table, FilterExpression=Attr('name').contains('foo')))
I can't work out why Boto3 provides high-level resource abstraction but doesn't provide pagination. When it does provide pagination, it's hard to use!
The other answers to this question were good but I wanted a super simple way to wrap the boto3 methods and provide memory-efficient paging using generators:
import typing
import boto3
import boto3.dynamodb.conditions
def paginate_dynamodb_response(dynamodb_action: typing.Callable, **kwargs) -> typing.Generator[dict, None, None]:
    """Yield the items of a paged DynamoDB call one at a time.

    Args:
        dynamodb_action: Callable such as a table's ``query`` or ``scan`` method.
        **kwargs: Parameters sent with every call; 'ExclusiveStartKey' is added
            once a previous response has supplied a 'LastEvaluatedKey'.

    Yields:
        Each entry of the response's "Items" list; a response without "Items"
        contributes nothing.
    """
    # Using the syntax from https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/dynamodb/GettingStarted/scenario_getting_started_movies.py
    request = kwargs
    next_key = None
    while True:
        if next_key:
            request['ExclusiveStartKey'] = next_key
        page = dynamodb_action(**request)
        next_key = page.get('LastEvaluatedKey')
        yield from page.get("Items", [])
        if next_key is None:
            break
## Usage ##
dynamodb_res = boto3.resource('dynamodb')
dynamodb_table = dynamodb_res.Table('my-table')

query = paginate_dynamodb_response(
    dynamodb_table.query,  # The boto3 method. E.g. query or scan
    # Regular Query or Scan parameters
    #
    # IndexName='myindex'  # If required
    KeyConditionExpression=boto3.dynamodb.conditions.Key('id').eq('1234'),
)

for x in query:
    print(x)