I'm trying to mock Textract using Moto.
I have this lambda_function:
from textractcaller.t_call import call_textract, Textract_Features
def lambda_function(event, context):
    s3InputDocPath = "s3://test"
    jsonObject = call_textract(
        input_document=s3InputDocPath,
        features=[Textract_Features.FORMS, Textract_Features.TABLES],
        job_done_polling_interval=5,
    )
textract.py
@pytest.fixture
def callTextract():
    textract = boto3.client("textract", region_name="us-east-1")
    bucket = "inputBucket"
    textract.analyze_document(
        Document={"S3Object": {"Bucket": bucket, "Name": "test_doc.pdf"}},
        FeatureTypes=["TABLES", "FORMS"]
    )
Finally, my test file:
test_lambda_function.py
@mock_textract
@mock_s3
def test_lambda_function(event, callTextract, s3_object):
    callTextract()
    s3_object()
    result = lambda_function(event, None)
But the Textract calls are not getting mocked, and I'm getting the error below:
NotImplementedError: The analyze_document has not been Implemented
Can anyone help, please?
You can patch unsupported features like this:
import boto3
import botocore
from unittest.mock import patch

orig = botocore.client.BaseClient._make_api_call

def mock_make_api_call(self, operation_name, kwarg):
    if operation_name == 'AnalyzeDocument':
        # Make whatever changes you expect to happen during this operation
        return {
            "expected": "response",
            "for this": "operation"
        }
    # If we don't want to patch the API call, call the original API
    return orig(self, operation_name, kwarg)
@mock_textract
@mock_s3
def test_lambda_function(event, callTextract, s3_object):
    with patch('botocore.client.BaseClient._make_api_call', new=mock_make_api_call):
        callTextract()
        s3_object()
        result = lambda_function(event, None)
This allows you to completely customize the boto3 behaviour on a per-method basis.
Taken (and modified to fit your use case) from the Moto documentation here: http://docs.getmoto.org/en/latest/docs/services/patching_other_services.html
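For AnalyzeDocument specifically, you will probably want the stubbed return value to be shaped like a real Textract response rather than a placeholder, so downstream parsing in your Lambda does not break. A minimal sketch (the Blocks content here is an assumption; fill in whatever your code expects):

def mock_make_api_call(self, operation_name, kwarg):
    if operation_name == 'AnalyzeDocument':
        # Minimal AnalyzeDocument-shaped stub; a real response also carries
        # LINE/WORD/KEY_VALUE_SET/TABLE blocks with geometry and relationships.
        return {
            "DocumentMetadata": {"Pages": 1},
            "Blocks": [
                {"BlockType": "PAGE", "Id": "page-1"},
            ],
        }
    # Fall through to the real (Moto-mocked) API for everything else
    return orig(self, operation_name, kwarg)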
I am currently trying to write unit tests for my Python code using Moto and @mock_dynamodb2. So far it's been working for me to test my "successful operation" test cases, but I'm having trouble getting it to work for my "failure" cases.
In my test code I have:
@mock_dynamodb2
class TestClassUnderTestExample(unittest.TestCase):
    def setUp(self):
        ddb = boto3.resource("dynamodb", "us-east-1")
        self.table = ddb.create_table(<the table definition>)
        self.example_under_test = ClassUnderTestExample(ddb)

    def test_some_thing_success(self):
        expected_response = {<some value>}
        assert expected_response == self.example_under_test.write_entry(<some value>)

    def test_some_thing_failure(self):
        response = self.example_under_test.write_entry(<some value>)
        # How to assert exception is thrown by forcing put item to fail?
The ClassUnderTestExample would look something like this:
class ClassUnderTestExample:
    def __init__(self, ddb_resource=None):
        if not ddb_resource:
            ddb_resource = boto3.resource('dynamodb')
        self.table = ddb_resource.Table(.....)

    def write_entry(self, some_value):
        ddb_item = <do stuff with some_value to create sanitized item>
        response = self.table.put_item(
            Item=ddb_item
        )
        if pydash.get(response, "ResponseMetadata.HTTPStatusCode") != 200:
            raise SomeCustomErrorType("Unexpected response from DynamoDB when attempting to PutItem")
        return ddb_item
I've been completely stuck when it comes to actually mocking the .put_item operation to return a non-success value so that I can test that the ClassUnderTestExample will handle it as expected and throw the custom error. I've tried things like deleting the table before running the test, but that just throws an exception when getting the table rather than an executed PutItem with an error code.
I've also tried putting a patch for pydash or for the table above the test but I must be doing something wrong. I can't find anything in moto's documentation. Any help would be appreciated!
The goal of Moto is to completely mimic AWS's behaviour, including how it behaves when the user supplies erroneous inputs. In other words, a call to put_item() that fails against AWS would/should also fail against Moto.
There is no built-in way to force an error response on a valid input.
It's difficult to tell from your example how this can be forced, but it looks like it's worth playing around with this line to create an invalid input:
ddb_item = <do stuff with some_value to create sanitized item>
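For example, if the sanitized item ends up missing the table's hash key, put_item is rejected with a ClientError (ValidationException) by Moto just as it would be by AWS, which gives you a failure path to assert on. A minimal sketch, assuming the table's hash key is 'id':

from botocore.exceptions import ClientError

def test_write_entry_invalid_item(self):
    # An item missing the table's hash key ('id' is assumed here) is invalid
    # input, so Moto raises a ClientError like the real service does.
    with self.assertRaises(ClientError):
        self.table.put_item(Item={"some_other_attribute": "value"})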
Yes, you can. Use mocking for this. Here is a simple, runnable example:
from unittest import TestCase
from unittest.mock import Mock
from uuid import uuid4

import boto3
from moto import mock_dynamodb2


def create_user_table(table_name: str) -> dict:
    return dict(
        TableName=table_name,
        KeySchema=[
            {
                'AttributeName': 'id',
                'KeyType': 'HASH'
            },
        ],
        AttributeDefinitions=[
            {
                'AttributeName': 'id',
                'AttributeType': 'S'
            },
        ],
        BillingMode='PAY_PER_REQUEST'
    )


class UserRepository:
    table_name = 'users'

    def __init__(self, ddb_resource):
        if not ddb_resource:
            ddb_resource = boto3.resource('dynamodb')
        self.table = ddb_resource.Table(self.table_name)

    def create_user(self, username):
        return self.table.put_item(Item={'id': str(uuid4()), 'username': username})


@mock_dynamodb2
class TestUserRepository(TestCase):
    def setUp(self):
        ddb = boto3.resource("dynamodb", "us-east-1")
        self.table = ddb.create_table(**create_user_table('users'))
        self.test_user_repo = UserRepository(ddb)

    def tearDown(self):
        self.table.delete()

    def test_some_thing_success(self):
        user = self.test_user_repo.create_user(username='John')
        assert len(self.table.scan()['Items']) == 1

    def test_some_thing_failure(self):
        self.test_user_repo.table = table = Mock()
        table.put_item.side_effect = Exception('Boto3 Exception')
        with self.assertRaises(Exception) as exc:
            self.test_user_repo.create_user(username='John')
        self.assertTrue('Boto3 Exception' in str(exc.exception))
Requesting assistance with the following error.
An error occurred (ModelError) when calling the InvokeEndpoint
operation: Received client error (415) from model with message
"Content-type application/octet-stream not supported. Supported
content-type is text/csv, text/libsvm"
Here is the relevant code -
from sagemaker import image_uris
from sagemaker.estimator import Estimator
xgboost_hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "num_round": "50"
}

xgboost_image = image_uris.retrieve("xgboost", boto3.Session().region_name, version="1")

estimator = Estimator(image_uri=xgboost_image,
                      hyperparameters=xgboost_hyperparameters,
                      role=role,
                      instance_count=1,
                      instance_type='ml.m5.2xlarge',
                      output_path=output_loc,
                      volume_size=5)
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer
train_input = sagemaker.inputs.TrainingInput(s3_data = train_loc, content_type='text/csv',s3_data_type = 'S3Prefix')
valid_input = sagemaker.inputs.TrainingInput(s3_data = validation_loc, content_type='text/csv',s3_data_type = 'S3Prefix')
estimator.CONTENT_TYPE = 'text/csv'
estimator.serializer = CSVSerializer()
estimator.deserializer = None
estimator.fit({'train':train_input, 'validation': valid_input})
# deploy model with data config
from sagemaker.model_monitor import DataCaptureConfig
from time import gmtime, strftime
s3_capture_upload_path = 's3://{}/{}/monitoring/datacapture'.format(bucket, prefix)
model_name = 'project3--model-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = 'project3-endpoint'
data_capture_configuration = DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=100,
    destination_s3_uri=s3_capture_upload_path)

deploy = estimator.deploy(initial_instance_count=1,
                          instance_type='ml.m4.xlarge',
                          data_capture_config=data_capture_configuration,
                          model_name=model_name,
                          endpoint_name=endpoint_name
                          )
Then I hit the error in the Predictor:
from sagemaker.predictor import Predictor
from time import sleep

predictor = Predictor(endpoint_name=endpoint_name)

with open('test.csv', 'r') as f:
    for row in f:
        print(row)
        payload = row.rstrip('\n')
        response = predictor.predict(data=payload[2:])
        sleep(0.5)
print('done!')
I looked at these links but haven't found an answer
https://github.com/aws-samples/reinvent2019-aim362-sagemaker-debugger-model-monitor/blob/master/02_deploy_and_monitor/deploy_and_monitor.ipynb
How can I specify content_type in a training job of XGBoost from Sagemaker in Python?
https://github.com/aws/amazon-sagemaker-examples/issues/729
First, please check which SDK version you are using. AWS made breaking changes between 1.x and 2.x of the SageMaker Python SDK. Worse, the SDK version on a notebook instance can differ from the one in SageMaker Studio, depending on the region.
Please see How to use Serializer and Deserializer in Sagemaker 2, since AWS changed the serialize/deserialize behaviour.
Predictors
Behavior for serialization of input data and deserialization of result data can be configured through initializer arguments.
class sagemaker.serializers.CSVSerializer(content_type='text/csv')
Please try:
from sagemaker.serializers import CSVSerializer
predictor.serializer = CSVSerializer()
Or, by setting the serializer to None, you can take full control of serialization/deserialization in your own code.
predictor.serializer=None
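With SDK v2 you can also pass the serializer and deserializer when constructing the Predictor itself; a minimal sketch, reusing the endpoint_name from the question:

from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

# CSVSerializer sends the payload with Content-Type text/csv, which the
# XGBoost container accepts.
predictor = Predictor(
    endpoint_name=endpoint_name,
    serializer=CSVSerializer(),
    deserializer=CSVDeserializer(),
)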
As seen here, predict()'s initial_args is "Default arguments for boto3 invoke_endpoint call".
For me, this worked:
predictor.predict(review_input, initial_args={'ContentType': 'text/plain'})
Using the reference API for predictor here with initial_args
predictor.predict(review_input, initial_args={'ContentType': 'text/csv'})
I am a beginner when it comes to writing tests and mocking.
I have created two modules. One module object (Site) creates another object from my second module (Item) on init. The Item object makes a call to an API endpoint to get some data using requests.
I want to Mock the API call I am making so I can test things like a bad response and importantly have control over the response data.
I have simplified my code and put below. When I run the test I get back the actual response data and not what I have mocked.
I have a feeling I am not putting my Mock in the right place. Also, I have seen lots of people saying to use the @unittest.mock.patch decorator; I am not clear if I should be using that here.
So I am looking for how to get _get_range_detail to actually return a mocked response from requests, and also general feedback on whether I am approaching this the right way.
# hello_world.py
from mymodule.site import Site
sites = [
dict(
name="site1",
ranges=[
"range1",
"range2"
]
)
]
site_object = Site(sites[0]['name'], sites[0]['ranges'])
for i in site_object.get_ranges_objects():
    print(i.range_detail)
# site.py
from mymodule.item import Item
class Site:
    def __init__(self, name, ranges):
        self.name = name
        self.ranges = ranges
        self.ranges_objects = []
        for my_range in ranges:
            self.ranges_objects.append(Item(my_range))

    def get_ranges_objects(self):
        return self.ranges_objects
# item.py
import requests
class Item:
    def __init__(self, range_name):
        self.range_name = range_name
        self.range_detail = self._get_range_detail(self.range_name)

    def _get_range_detail(self, range_name):
        uri = "https://postman-echo.com/get?some_cool_value=real_value"
        try:
            r = requests.get(uri)
            if r.status_code == 200:
                return r.json()['args']
            else:
                return None
        except Exception as e:
            print(e)
            exit(1)
# test_site.py
import pytest
from mymodule.site import Site
from unittest import mock
from mymodule.item import requests
def test_get_ranges_objects():
    sites = [
        dict(
            name="site1",
            ranges=[
                "range1",
                "range2"
            ]
        )
    ]
    requests = mock.Mock()
    requests.status_code = 200
    requests.json.return_value = {
        'args': {'some_mock_value': 'mocky'}
    }
    site_object = Site(sites[0]['name'], sites[0]['ranges'])
    assert site_object.name == "site1"
    assert isinstance(site_object.ranges_objects, list)
    assert site_object.ranges_objects[0].range_detail == dict(some_mock_value='mocky')
You can use pytest-mock. It makes mocking in pytest simple (pip install pytest-mock).
You should replace requests.get. Simply:
requests_get = mock.patch('requests.get').start()
If you use pytest-mock,
requests_get = mocker.patch('requests.get')
Here is the test case rewritten using pytest-mock:
# test_site.py
import pytest
from mymodule.site import Site
from unittest import mock


@pytest.fixture
def requests_get(mocker):
    requests_get = mocker.patch('requests.get')
    yield requests_get


def test_get_ranges_objects(mocker, requests_get):
    response = mocker.MagicMock()
    response.status_code = 200
    response.json.return_value = {'args': {'some_mock_value': 'mocky'}}
    requests_get.return_value = response

    sites = [
        dict(
            name="site1",
            ranges=[
                "range1",
                "range2"
            ]
        )
    ]
    site_object = Site(sites[0]['name'], sites[0]['ranges'])
    assert site_object.name == "site1"
    assert isinstance(site_object.ranges_objects, list)
    assert site_object.ranges_objects[0].range_detail == {'some_mock_value': 'mocky'}
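Since the question also mentions the @unittest.mock.patch decorator: the same test can be written with it by patching requests.get where item.py looks it up. A minimal sketch under that assumption:

from unittest import mock

from mymodule.site import Site


@mock.patch('mymodule.item.requests.get')
def test_get_ranges_objects_with_patch(mock_get):
    # Configure the fake response that item.py will receive from requests.get
    mock_get.return_value.status_code = 200
    mock_get.return_value.json.return_value = {'args': {'some_mock_value': 'mocky'}}

    site_object = Site("site1", ["range1", "range2"])
    assert site_object.ranges_objects[0].range_detail == {'some_mock_value': 'mocky'}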
Following is the Lambda function I wrote that gets the list of Auto Scaling groups and prints them.
import json
import boto3
import boto.ec2.autoscale

role = "arn:aws:iam::XXXXXXXXXX:role/lambda-autoshutdown-role"
regions = ["eu-central-1"]

autoscaling = boto3.client('autoscaling')


class App(object):
    def __init__(self, RoleArn):
        self.RoleArn = RoleArn
        if self.RoleArn != "local":
            sts_client = boto3.client('sts')
            self.sts = sts_client.assume_role(
                RoleArn=self.RoleArn,
                RoleSessionName="lambda_poweroff")["Credentials"]

    def get_resource(self, region="eu-central-1"):
        if self.RoleArn == "local":
            return boto3.resource(region_name=region)
        else:
            return boto.ec2.autoscale.connect_to_region(
                region_name=region,
                aws_access_key_id=self.sts['AccessKeyId'],
                aws_secret_access_key=self.sts['SecretAccessKey'],)


def lambda_handler(event, context):
    a = App(role)
    for region in regions:
        asgs = a.get_resource(region)
        # locate all running instances
        # autoscaling_groups_to_suspend = []
        # for i in asgs:
        #     print(asgs[i])
        print('[%s]' % ', '.join(map(str, asgs)))
This function uses boto.ec2.autoscale.connect_to_region to connect and returns the object.
But when I try to deploy it on AWS, I get the following error:
Unable to import module 'lambda_function': No module named boto.ec2.autoscale
It seems like the module boto.ec2.autoscale is not available in the AWS Lambda runtime.
Any idea what might be wrong here?
For anyone looking for an answer, the following piece of Lambda code gets the list of all ASGs and then suspends them (except the ones that match the regex):
import json
import re

import boto3

regions = ["eu-central-1"]
autoscaling = boto3.client('autoscaling')


def lambda_handler(event, context):
    response = autoscaling.describe_auto_scaling_groups(MaxRecords=100)
    # print(response)
    # print(response['AutoScalingGroups'][0]['AutoScalingGroupName'])
    autoscaling_group_to_suspend = []
    for doc in response['AutoScalingGroups']:
        response_parsed = doc['AutoScalingGroupName']
        autoscaling_group_to_suspend.append(response_parsed)
    # print(autoscaling_group_to_suspend)

    regex = re.compile(r'es-data-asg|consul|influxdb|vault|es-master')
    # Equivalent using filter():
    # filtered = filter(lambda i: not regex.search(i), autoscaling_group_to_suspend)
    filtered = [i for i in autoscaling_group_to_suspend if not regex.search(i)]
    print(filtered)

    if len(filtered) > 0:
        for x in filtered:
            autoscaling.suspend_processes(AutoScalingGroupName=x)
I am trying to do the same thing with S3. I need boto.s3.connect_to_region(), but I get the same error. Packaging the boto module with the Lambda deployment might solve the issue; otherwise, we have to use boto3.client and parse the JSON response to get the appropriate values.
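A minimal sketch of that boto3-only approach, reusing the role and region from the question (note that assumed-role credentials also need the SessionToken):

import boto3


def get_autoscaling_client(role_arn, region="eu-central-1"):
    # Assume the role with STS, then build a boto3 autoscaling client from the
    # temporary credentials; boto3 is available in the Lambda runtime, boto is not.
    creds = boto3.client('sts').assume_role(
        RoleArn=role_arn,
        RoleSessionName="lambda_poweroff")["Credentials"]
    return boto3.client(
        'autoscaling',
        region_name=region,
        aws_access_key_id=creds['AccessKeyId'],
        aws_secret_access_key=creds['SecretAccessKey'],
        aws_session_token=creds['SessionToken'],
    )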
My table is around 220 MB with 250k records in it. I'm trying to pull all of this data into Python. I realize this needs to be a chunked batch process and looped through, but I'm not sure how I can set the batches to start where the previous one left off.
Is there some way to filter my scan? From what I read, filtering occurs after loading, and the loading stops at 1 MB, so I wouldn't actually be able to scan in new objects.
Any assistance would be appreciated.
import boto3
dynamodb = boto3.resource('dynamodb',
aws_session_token = aws_session_token,
aws_access_key_id = aws_access_key_id,
aws_secret_access_key = aws_secret_access_key,
region_name = region
)
table = dynamodb.Table('widgetsTableName')
data = table.scan()
I think the Amazon DynamoDB documentation regarding table scanning answers your question.
In short, you'll need to check for LastEvaluatedKey in the response. Here is an example using your code:
import boto3
dynamodb = boto3.resource('dynamodb',
aws_session_token=aws_session_token,
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
region_name=region
)
table = dynamodb.Table('widgetsTableName')
response = table.scan()
data = response['Items']

while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    data.extend(response['Items'])
DynamoDB limits the scan method to 1 MB of data per scan.
Documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb.html#DynamoDB.Client.scan
Here is an example loop to get all the data from a DynamoDB table using LastEvaluatedKey:
import boto3

client = boto3.client('dynamodb')


def dump_table(table_name):
    results = []
    last_evaluated_key = None
    while True:
        if last_evaluated_key:
            response = client.scan(
                TableName=table_name,
                ExclusiveStartKey=last_evaluated_key
            )
        else:
            response = client.scan(TableName=table_name)
        last_evaluated_key = response.get('LastEvaluatedKey')
        results.extend(response['Items'])

        if not last_evaluated_key:
            break
    return results


# Usage
data = dump_table('your-table-name')
# do something with data
boto3 offers paginators that handle all the pagination details for you. Here is the doc page for the scan paginator. Basically, you would use it like so:
import boto3
client = boto3.client('dynamodb')
paginator = client.get_paginator('scan')
for page in paginator.paginate():
    # do something
Riffing off of Jordon Phillips's answer, here's how you'd pass a FilterExpression in with the pagination:
import boto3
client = boto3.client('dynamodb')
paginator = client.get_paginator('scan')
operation_parameters = {
    'TableName': 'foo',
    'FilterExpression': 'bar > :x AND bar < :y',
    'ExpressionAttributeValues': {
        ':x': {'S': '2017-01-31T01:35'},
        ':y': {'S': '2017-01-31T02:08'},
    }
}

page_iterator = paginator.paginate(**operation_parameters)
for page in page_iterator:
    # do something
Code for removing the DynamoDB format types from the output, as @kungphu mentioned:
import boto3
from boto3.dynamodb.types import TypeDeserializer
from boto3.dynamodb.transform import TransformationInjector
client = boto3.client('dynamodb')
paginator = client.get_paginator('query')
service_model = client._service_model.operation_model('Query')
trans = TransformationInjector(deserializer = TypeDeserializer())
for page in paginator.paginate():
    trans.inject_attribute_value_output(page, service_model)
Turns out that Boto3 captures the "LastEvaluatedKey" as part of the returned response. This can be used as the start point for a scan:
data = table.scan(
    ExclusiveStartKey=data['LastEvaluatedKey']
)
I plan on building a loop around this that runs until the response no longer contains a LastEvaluatedKey.
The two approaches suggested above both have problems: either writing lengthy and repetitive code that handles paging explicitly in a loop, or using Boto paginators with low-level clients and forgoing the advantages of the higher-level Boto objects.
A solution using Python functional code to provide a high-level abstraction allows higher-level Boto methods to be used, while hiding the complexity of AWS paging:
import itertools
import typing


def iterate_result_pages(function_returning_response: typing.Callable, *args, **kwargs) -> typing.Generator:
    """A wrapper for functions using AWS paging, that returns a generator which yields a sequence of items for
    every response

    Args:
        function_returning_response: A function (or callable), that returns an AWS response with 'Items' and optionally 'LastEvaluatedKey'
            This could be a bound method of an object.

    Returns:
        A generator which yields the 'Items' field of the result for every response
    """
    response = function_returning_response(*args, **kwargs)
    yield response["Items"]
    while "LastEvaluatedKey" in response:
        kwargs["ExclusiveStartKey"] = response["LastEvaluatedKey"]
        response = function_returning_response(*args, **kwargs)
        yield response["Items"]
    return


def iterate_paged_results(function_returning_response: typing.Callable, *args, **kwargs) -> typing.Iterator:
    """A wrapper for functions using AWS paging, that returns an iterator of all the items in the responses.
    Items are yielded to the caller as soon as they are received.

    Args:
        function_returning_response: A function (or callable), that returns an AWS response with 'Items' and optionally 'LastEvaluatedKey'
            This could be a bound method of an object.

    Returns:
        An iterator which yields one response item at a time
    """
    return itertools.chain.from_iterable(iterate_result_pages(function_returning_response, *args, **kwargs))


# Example, assuming 'table' is a Boto DynamoDB table object:
all_items = list(iterate_paged_results(table.scan, ProjectionExpression='my_field'))
I had some problems with Vincent's answer related to the transformation being applied to the LastEvaluatedKey and messing up the pagination. Solved as follows:
import boto3
from boto3.dynamodb.types import TypeDeserializer
from boto3.dynamodb.transform import TransformationInjector
client = boto3.client('dynamodb')
paginator = client.get_paginator('scan')
operation_model = client._service_model.operation_model('Scan')
trans = TransformationInjector(deserializer = TypeDeserializer())
operation_parameters = {
'TableName': 'tablename',
}
items = []
for page in paginator.paginate(**operation_parameters):
has_last_key = 'LastEvaluatedKey' in page
if has_last_key:
last_key = page['LastEvaluatedKey'].copy()
trans.inject_attribute_value_output(page, operation_model)
if has_last_key:
page['LastEvaluatedKey'] = last_key
items.extend(page['Items'])
If you are landing here looking for a paginated scan with some filtering expression(s):
def scan(table, **kwargs):
    response = table.scan(**kwargs)
    yield from response['Items']
    while response.get('LastEvaluatedKey'):
        response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'], **kwargs)
        yield from response['Items']
Example usage:
from boto3.dynamodb.conditions import Attr

table = boto3.Session(...).resource('dynamodb').Table('widgetsTableName')
items = list(scan(table, FilterExpression=Attr('name').contains('foo')))
I can't work out why Boto3 provides high-level resource abstraction but doesn't provide pagination. When it does provide pagination, it's hard to use!
The other answers to this question were good but I wanted a super simple way to wrap the boto3 methods and provide memory-efficient paging using generators:
import typing
import boto3
import boto3.dynamodb.conditions
def paginate_dynamodb_response(dynamodb_action: typing.Callable, **kwargs) -> typing.Generator[dict, None, None]:
    # Using the syntax from https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/dynamodb/GettingStarted/scenario_getting_started_movies.py

    keywords = kwargs
    done = False
    start_key = None

    while not done:
        if start_key:
            keywords['ExclusiveStartKey'] = start_key

        response = dynamodb_action(**keywords)

        start_key = response.get('LastEvaluatedKey', None)
        done = start_key is None

        for item in response.get("Items", []):
            yield item


## Usage ##

dynamodb_res = boto3.resource('dynamodb')
dynamodb_table = dynamodb_res.Table('my-table')

query = paginate_dynamodb_response(
    dynamodb_table.query,  # The boto3 method. E.g. query or scan

    # Regular Query or Scan parameters
    #
    # IndexName='myindex'  # If required

    KeyConditionExpression=boto3.dynamodb.conditions.Key('id').eq('1234')
)

for x in query:
    print(x)