I want to retrieve assignments from an old HIT on mturk using boto. This is what I did.
from boto.mturk.connection import MTurkConnection
import csv
mtc = MTurkConnection(aws_access_key_id=ACCESS_ID,
aws_secret_access_key=SECRET_KEY,
host=HOST)
assignments = mtc.get_assignments(HIT_ID, status=None, sort_by='SubmitTime', sort_direction='Ascending', page_size=10, page_number=1, response_groups=None)
with open('assignments.csv','w') as tar:
csvwriter = csv.writer(tar, delimiter=';', quoting = csv.QUOTE_NONE, quotechar='')
for asgn in assignments:
row = []
for i in range(len(asgn.answers[0])):
row.append(asgn.answers[0][i].fields[0])
csvwriter.writerow(row)
But it is throwing this error.
Traceback (most recent call last):
File "launch_task.py", line 13, in <module>
assignments = mtc.get_assignments(HIT_ID, status=None, sort_by='SubmitTime', sort_direction='Ascending', page_size=10, page_number=1, response_groups=None)
File "/home/projects/django_version/env/local/lib/python2.7/site-packages/boto/mturk/connection.py", line 417, in get_assignments
[('Assignment', Assignment)])
File "/home/projects/django_version/env/local/lib/python2.7/site-packages/boto/mturk/connection.py", line 838, in _process_request
return self._process_response(response, marker_elems)
File "/home/projects/django_version/env/local/lib/python2.7/site-packages/boto/mturk/connection.py", line 853, in _process_response
raise MTurkRequestError(response.status, response.reason, body)
boto.mturk.connection.MTurkRequestError: MTurkRequestError: 200 OK
<?xml version="1.0"?>
<GetAssignmentsForHITResponse><OperationRequest>
<RequestId>b006785d-4001-4835-adf6-2df34joW9588e</RequestId></OperationRequest>
<GetAssignmentsForHITResult><Request><IsValid>False</IsValid>
<Errors><Error><Code>AWS.MechanicalTurk.HITDoesNotExist</Code>
<Message>Hit 3MRHU529465LUERYI53175P072A8 does not exist. (1463449047025 s)</Message>
<Data><Key>HITId</Key><Value>3MRHU529465LUERYI53175P072A8</Value></Data>
<Data><Key>HITId</Key><Value>3MRHU529465LUERYI53175P072A8</Value></Data>
</Error></Errors>
</Request></GetAssignmentsForHITResult></GetAssignmentsForHITResponse>
I assure you the HIT does exist. I am using boto 2.40.0 .
Related
I am attempting to download either .pdfs or .xml files from two separate google drive folders. I can get the Python code to work in other folders that only contain the .pdfs and .xml files. The problem is that the production folders contain other files and folders that I do not want to download. Is there a way to only download files by extension type? If so, please help with Python in mind.
The reason I ask is because I have been unsuccessful querying by "'name contains 'blah'"
import google_drive.constants as c
import os
import io
import pandas as pd
from googleapiclient.http import MediaIoBaseDownload
service = Create_Service(c.CLIENT_SECRET_FILE, c.API_NAME, c.API_VERSION, c.SCOPES)
def compare_file_dates(df):
# Convert to datetime
df['createdTime'] = pd.to_datetime(df['createdTime'])
# Find the row with the latest createdTime timestamp
row = df[df.createdTime == df.createdTime.max()]
latest = row['id'][0]
print(latest)
return latest
def get_latest_file(folder_id):
query = f"parents = '{folder_id}'"
fields = "nextPageToken, files(id, name contains 'Quote' or name contains 'FGT', createdTime, mimeType)"
response = service.files().list(q=query, fields=fields).execute()
files = response.get('files')
next_page_token = response.get('nextPageToken')
while next_page_token:
response = service.files().list(q=query, fields=fields).execute()
files.extende(response.get('files'))
next_page_token = response.get('nextPageToken')
df = pd.DataFrame(files)
latest_file = compare_file_dates(df)
return latest_file
def download_files():
pdf_id = get_latest_file(c.PDF_FOLDER_ID)
xml_id = get_latest_file(c.XML_FOLDER_ID)
file_ids = [pdf_id, xml_id]
file_names = ['expense.pdf', 'sell.xml']
for file_id, file_name in zip(file_ids, file_names):
request = service.files().get_media(fileId=file_id)
fh = io.BytesIO()
downloader = MediaIoBaseDownload(fd=fh, request=request)
done = False
while not done:
status, done = downloader.next_chunk()
print('Download progress {0}'.format(status.progress() * 100))
fh.seek(0)
with open(os.path.join('google_drive/downloads', file_name), 'wb') as f:
f.write(fh.read())
f.close()
download_files()
Here is the error:
googleapiclient.errors.HttpError: <HttpError 400 when requesting https://www.googleapis.com/drive/v3/files?q=parents+%3D+%271hMqacHtVlLOM1sD9cLsxm5jh4a7RHSGl%27&fields=nextPageToken%2C+files%28id%2C+name+contains+%27Quote%27+or+name+contains+%27FGT%27%2C+createdTime%2C+mimeType%29&alt=json returned "Invalid field selection nextPageToken, files(id, name contains 'Quote' or name contains 'FGT', createdTime, mimeType)". Details: "[{'domain': 'global', 'reason': 'invalidParameter', 'message': "Invalid field selection nextPageToken, files(id, name contains 'Quote' or name contains 'FGT', createdTime, mimeType)", 'locationType': 'parameter', 'location': 'fields'}]">
I believe the issue is figuring out the correct format or encoding for the "name contains 'Quote' or name contains 'FGT'".
The PDF file has names that start with the 'Quote' but end with random characters. Example would be 'Quote123456.pdf'. The XML file has a similar naming structure so 'FGT1236098.xml'. Do I need a wild card character on the "name contains 'Quote'"?
Is this the correct encoding for the above "name contains 'Quote' or name contains 'FGT'" search query? name+contains+%27Quote%27+or+name+contains+%27FGT%27%
new error
2022-06-19 06:21:59.658560 - Error: Traceback (most recent call last):
File "C:\Automations\GetMargins\venv\lib\site-packages\pandas\core\indexes\base.py", line 3621, in get_loc
return self._engine.get_loc(casted_key)
File "pandas_libs\index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
File "pandas_libs\index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
File "pandas_libs\hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas_libs\hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'createdTime'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Automations\GetMargins\main.py", line 12, in run
download_files() # Get the latest xml and pdf from Google Drive
File "C:\Automations\GetMargins\google_drive\google_drive_sub_task.py", line 42, in download_files
xml_id = get_latest_file(c.XML_FOLDER_ID)
File "C:\Automations\GetMargins\google_drive\google_drive_sub_task.py", line 35, in get_latest_file
latest_file = compare_file_dates(df)
File "C:\Automations\GetMargins\google_drive\google_drive_sub_task.py", line 13, in compare_file_dates
df['createdTime'] = pd.to_datetime(df['createdTime'])
File "C:\Automations\GetMargins\venv\lib\site-packages\pandas\core\frame.py", line 3505, in getitem
indexer = self.columns.get_loc(key)
File "C:\Automations\GetMargins\venv\lib\site-packages\pandas\core\indexes\base.py", line 3623, in get_loc
raise KeyError(key) from err
KeyError: 'createdTime'
When you are listing your files. You should search on the types of files you want.
Currently you have
query = f"parents = '{folder_id}'"
try adding
query = f"parents = '{folder_id}' and (mimeType='application/pdf' or mimeType='application/xml')"
Check search refrence
I set up a try catch in my code, but it appears that my exception was not correct because it did not seem to catch it.
I am using an exception from a module, and perhaps I didn't import it correctly? Here is my code:
import logging
import fhirclient.models.bundle as b
from fhirclient.server import FHIRUnauthorizedException
logging.disable(logging.WARNING)
def get_all_resources(resource, struct, smart):
'''Perform a search on a resource type and get all resources entries from all retunred bundles.\n
This function takes all paginated bundles into consideration.'''
if smart.ready == False:
smart.reauthorize
search = resource.where(struct)
bundle = search.perform(smart.server)
resources = [entry.resource for entry in bundle.entry or []]
next_url = _get_next_url(bundle.link)
while next_url != None:
try:
json_dict = smart.server.request_json(next_url)
except FHIRUnauthorizedException:
smart.reauthorize
continue
bundle = b.Bundle(json_dict)
resources += [entry.resource for entry in bundle.entry or []]
next_url = _get_next_url(bundle.link)
return resources
Now when i ran the code I got the following error:
Traceback (most recent call last):
File "code.py", line 79, in <module>
main()
File "code.py", line 42, in main
reports = get_all_resources(dr.DiagnosticReport, search, smart)
File "somepath/fhir_tools/resource.py", line 23, in get_all_resources
json_dict = smart.server.request_json(next_url)
File "/usr/local/lib/python3.6/dist-packages/fhirclient/server.py", line 153, in request_json
res = self._get(path, headers, nosign)
File "/usr/local/lib/python3.6/dist-packages/fhirclient/server.py", line 181, in _get
self.raise_for_status(res)
File "/usr/local/lib/python3.6/dist-packages/fhirclient/server.py", line 256, in raise_for_status
raise FHIRUnauthorizedException(response)
server.FHIRUnauthorizedException: <Response [401]>
Shouldn't my exception catch this?
I'm having trouble using the Python email module to parse emails where the FROM header has parentheses in it. This only seems to be the problem when using email.policy.default as opposed to email.policy.compat32.
Is there a solution to this problem, other than switching policies?
A minimum working example is below, for Python 3.6.5:
import email
import email.policy as email_policy
raw_mime_msg=b"from: James Mishra \\(says hi\\) <james#example.com>"
compat32_obj = email.message_from_bytes(
raw_mime_msg, policy=email_policy.compat32)
default_obj = email.message_from_bytes(
raw_mime_msg, policy=email_policy.default)
print(compat32_obj['from'])
print(default_obj['from'])
The first print statement returns:
James Mishra \(says hi\) <james#example.com>
and the second print statement returns:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/email/_header_value_parser.py", line 1908, in get_address
token, value = get_group(value)
File "/usr/local/lib/python3.6/email/_header_value_parser.py", line 1867, in get_group
"display name but found '{}'".format(value))
email.errors.HeaderParseError: expected ':' at end of group display name but found '\(says hi\) <james#example.com>'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/email/_header_value_parser.py", line 1734, in get_mailbox
token, value = get_name_addr(value)
File "/usr/local/lib/python3.6/email/_header_value_parser.py", line 1720, in get_name_addr
token, value = get_angle_addr(value)
File "/usr/local/lib/python3.6/email/_header_value_parser.py", line 1646, in get_angle_addr
"expected angle-addr but found '{}'".format(value))
email.errors.HeaderParseError: expected angle-addr but found '\(says hi\) <james#example.com>'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "test_email.py", line 12, in <module>
print(default_obj['from'])
File "/usr/local/lib/python3.6/email/message.py", line 391, in __getitem__
return self.get(name)
File "/usr/local/lib/python3.6/email/message.py", line 471, in get
return self.policy.header_fetch_parse(k, v)
File "/usr/local/lib/python3.6/email/policy.py", line 162, in header_fetch_parse
return self.header_factory(name, value)
File "/usr/local/lib/python3.6/email/headerregistry.py", line 589, in __call__
return self[name](name, value)
File "/usr/local/lib/python3.6/email/headerregistry.py", line 197, in __new__
cls.parse(value, kwds)
File "/usr/local/lib/python3.6/email/headerregistry.py", line 340, in parse
kwds['parse_tree'] = address_list = cls.value_parser(value)
File "/usr/local/lib/python3.6/email/headerregistry.py", line 331, in value_parser
address_list, value = parser.get_address_list(value)
File "/usr/local/lib/python3.6/email/_header_value_parser.py", line 1931, in get_address_list
token, value = get_address(value)
File "/usr/local/lib/python3.6/email/_header_value_parser.py", line 1911, in get_address
token, value = get_mailbox(value)
File "/usr/local/lib/python3.6/email/_header_value_parser.py", line 1737, in get_mailbox
token, value = get_addr_spec(value)
File "/usr/local/lib/python3.6/email/_header_value_parser.py", line 1583, in get_addr_spec
token, value = get_local_part(value)
File "/usr/local/lib/python3.6/email/_header_value_parser.py", line 1413, in get_local_part
obs_local_part, value = get_obs_local_part(str(local_part) + value)
File "/usr/local/lib/python3.6/email/_header_value_parser.py", line 1454, in get_obs_local_part
token, value = get_word(value)
File "/usr/local/lib/python3.6/email/_header_value_parser.py", line 1340, in get_word
if value[0]=='"':
IndexError: string index out of range
email.policy.default is intended to be compliant with the email RFCs, and your message is not compliant with RFC 5322. If the parenthesized part is supposed to be a comment, then the message should look like
raw_mime_msg=b"from: James Mishra (says hi) <james#example.com>"
to be compliant. If it is not supposed to be a comment, then the parentheses should appear inside a quoted string. That might look something like
raw_mime_msg=b'from: "James Mishra (says hi)" <james#example.com>'
Since your message is not compliant, using the policy that expects compliance is a poor fit. If you want to handle non-compliant messages, email.policy.compat32 is a better choice than email.policy.default.
I am trying to write a XML file but as I run ET.dump(root) I am getting:
<Reviews><Review rid="en_India'sGrill_477960693"><sentences><sentenceTraceback (most recent call last):
File "/usr/lib/python3.5/xml/etree/ElementTree.py", line 1078, in _escape_attrib
if "&" in text:
TypeError: argument of type 'Sentence' is not iterable
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/media/Data/workspaces/git/master-thesis/python/thesis/semeval/slot1/nlp_cnn.py", line 421, in <module>
f.write(ET.dump(root))
File "/usr/lib/python3.5/xml/etree/ElementTree.py", line 1165, in dump
elem.write(sys.stdout, encoding="unicode")
File "/usr/lib/python3.5/xml/etree/ElementTree.py", line 775, in write
short_empty_elements=short_empty_elements)
File "/usr/lib/python3.5/xml/etree/ElementTree.py", line 940, in _serialize_xml
short_empty_elements=short_empty_elements)
File "/usr/lib/python3.5/xml/etree/ElementTree.py", line 940, in _serialize_xml
short_empty_elements=short_empty_elements)
File "/usr/lib/python3.5/xml/etree/ElementTree.py", line 940, in _serialize_xml
short_empty_elements=short_empty_elements)
File "/usr/lib/python3.5/xml/etree/ElementTree.py", line 932, in _serialize_xml
v = _escape_attrib(v)
File "/usr/lib/python3.5/xml/etree/ElementTree.py", line 1090, in _escape_attrib
_raise_serialization_error(text)
File "/usr/lib/python3.5/xml/etree/ElementTree.py", line 1056, in _raise_serialization_error
"cannot serialize %r (type %s)" % (text, type(text).__name__)
TypeError: cannot serialize <thesis.semeval_data.Sentence object at 0x7f9087f20400> (type Sentence)
No idea what the problem is. I'm constructing this in a very straight forward way:
root = ET.Element("Reviews")
i = 0
for k in review_dict:
review_element = ET.Element("Review")
root.append(review_element)
review_element.set("rid", k)
sentences_element = ET.Element("sentences")
review_element.append(sentences_element)
sentence_dict = review_dict[k]
for k in sentence_dict:
sentence_element = ET.Element("sentence")
sentence_element.set("id", k)
sentences_element.append(sentence_element)
sentence = sentence_dict[k]
text_element = ET.Element("text")
text_element.text = sentence.text
sentences_element.append(text_element)
opinions_element = ET.Element("Opinions")
sentence_element.append(opinions_element)
for category in aspect_categories[y_pred[i] > 0]:
opinion_element = ET.Element("Opinion")
opinion_element.set("category", category)
opinions_element.append(opinion_element)
i += 1
f = open(os.path.join(os.path.curdir, "..", "..", "pred.xml"), "w")
f.write(ET.dump(root)) # Exception
Any idea why I am getting this error? At first I thought it's because I didn't escape things but it appears that xml.etree.ElementTree does that already.
Just look at the traceback: _escape_attrib() raises a serialization error. Then refer to the source: ElementTree expects that element attribute value is a string and catches exceptions if it is not.
Check the type of values being used as <sentence> attributes: sentence_element.set("id", k). One of them is not a string.
When I call Create API from the python console, It gives following exception.
Traceback (most recent call last):
File "<stdin>", line 3, in <module>
File "C:\Python27\lib\v1pysdk\base_asset.py", line 44, in create
return Class._v1_v1meta.create_asset(Class._v1_asset_type_name, newdata)
File "C:\Python27\lib\v1pysdk\v1meta.py", line 128, in create_asset
new_asset_xml = self.server.create_asset(asset_type_name, update_doc)
File "C:\Python27\lib\v1pysdk\client.py", line 202, in create_asset
return self.get_xml(path, query=query, postdata=body)
File "C:\Python27\lib\v1pysdk\client.py", line 159, in get_xml
document = ElementTree.fromstring(body)
File "C:\Python27\lib\xml\etree\ElementTree.py", line 1281, in XML
parser.feed(text)
File "C:\Python27\lib\xml\etree\ElementTree.py", line 1623, in feed
self._raiseerror(v)
File "C:\Python27\lib\xml\etree\ElementTree.py", line 1487, in _raiseerror
raise err
xml.etree.ElementTree.ParseError: reference to invalid character number: line 7575, column 75
I am running it with Python2.7 on windows.
This is the API I am calling
from v1pysdk import V1Meta
v1 = V1Meta(
address = 'www11.v1host.com',
instance = '<InstName>',
username = 'sbaid',
password = 'XXXXXX'
)
new_story = v1.Story.create(
Name = "Temp",
Scope = v1.Scope(321450)
)
v1.Scope(321450) returns the correct project name, that implies that session with version1 is established correctly.
These are the only two mandatory parameters and I am able to create the story with these two parameters using Web interface.
I am also able to create the story using following REST request
URL - https://www11.v1host.com/InstName/rest-1.v1/Data/Story
<Asset href="/<InstName>/rest-1.v1/New/Story">
<Attribute name="Name" act="set">Temp</Attribute>
<Relation name="Scope" act="set">
<Asset href="/<InstName>/rest-1.v1/Data/Scope/321450" idref="Scope:321450" />
</Relation>
</Asset>
There is an alternate way to specify the host address which is more reliable. Here's an example that you can try against the public VersionOne SDK testing instance:
from v1pysdk import V1Meta
with V1Meta (
instance_url = 'https://www14.v1host.com/v1sdktesting',
username = 'admin',
password = 'admin'
) as v1:
new_story = v1.Story.create(
Name = "Temp Test for StackOverflow question",
Scope = v1.Scope(0)
)
fetched_story = v1.Story.where(Number=new_story.Number).first()
print fetched_story.Name