Strange error adding to Whoosh index - python

Can anyone help me with this strange error I'm getting when adding a new document to a Whoosh index?
Here's the code:
def add_to_index(self, doc):
ix = index.open_dir(self.index_dir)
writer = AsyncWriter(ix) # use async writer to prevent write lock errors
writer.add_document(**self.get_doc_args(doc))
writer.commit()
def get_doc_args(self, doc):
return {
'id': u""+str(doc['id']),
'org': doc['org__id'],
'created': doc['created_date'],
'date': doc['received_date'],
'from_addr': doc['from_addr'],
'subject': doc['subject'],
'body': doc['messagebody__cleaned_message']
}
I get the following error:
TypeError('ord() expected a character, but string of length 0 found',)
Traceback (most recent call last):
File "/usr/local/lib/python2.6/dist-packages/celery/execute/trace.py", line 36, in trace
return cls(states.SUCCESS, retval=fun(*args, **kwargs))
File "/usr/local/lib/python2.6/dist-packages/celery/app/task/__init__.py", line 232, in __call__
return self.run(*args, **kwargs)
File "/usr/local/lib/python2.6/dist-packages/celery/app/__init__.py", line 172, in run
return fun(*args, **kwargs)
File "/mnt/deploy/prod/chorus/src/chorus/../chorus/search/__init__.py", line 131, in index_message
MessageSearcher().add_to_index(message)
File "/mnt/deploy/prod/chorus/src/chorus/../chorus/search/__init__.py", line 29, in add_to_index
writer.commit()
File "/usr/local/lib/python2.6/dist-packages/whoosh/writing.py", line 423, in commit
self.writer.commit(*args, **kwargs)
File "/usr/local/lib/python2.6/dist-packages/whoosh/filedb/filewriting.py", line 501, in commit
new_segments = mergetype(self, self.segments)
File "/usr/local/lib/python2.6/dist-packages/whoosh/filedb/filewriting.py", line 78, in MERGE_SMALL
reader = SegmentReader(writer.storage, writer.schema, seg)
File "/usr/local/lib/python2.6/dist-packages/whoosh/filedb/filereading.py", line 63, in __init__
self.termsindex = TermIndexReader(tf)
File "/usr/local/lib/python2.6/dist-packages/whoosh/filedb/filetables.py", line 590, in __init__
super(TermIndexReader, self).__init__(dbfile)
File "/usr/local/lib/python2.6/dist-packages/whoosh/filedb/filetables.py", line 502, in __init__
OrderedHashReader.__init__(self, dbfile)
File "/usr/local/lib/python2.6/dist-packages/whoosh/filedb/filetables.py", line 379, in __init__
HashReader.__init__(self, dbfile)
File "/usr/local/lib/python2.6/dist-packages/whoosh/filedb/filetables.py", line 187, in __init__
self.hashtype = dbfile.read_byte()
File "/usr/local/lib/python2.6/dist-packages/whoosh/filedb/structfile.py", line 219, in read_byte
return ord(self.file.read(1))
Strangely, the exact same code using a standard writer (i.e. not AsyncWriter) works just fine. What am I missing here? Note that in production I have to use AsyncWriter in order to avoid LockErrors.

This error is caused by some kind of index corruption. In my case the machine crashed by another reason while index was being rebuild.
You can easily solve it by deleting whoosh_index folder contents completely and rebuilidng index.

Ended up finding a solution; it's called Solr :-)

Related

How to use multiprocessing in Python with Django to create thousands of model instances from xml file?

I have a script that parses data from an xml file into Django models. It works ok but recently we've been encountering files with dozen of thousands of models to create which is incredibly slowing down the process (up to 40 min to upload a single 20MB file).
I would like to try and use multiprocessing to speed it up as much as I can but I am new to this package. This is the current setup:
create_profiles_from_xml: is a function that loops through the data extracted from the xml, called xml_members, popping one element at a time (it has to be like that due to a relationship that can be created with the next element or not).
create_profile: is a just a function that handles the data to match with the model fields and then call the .create() method, that's why I omitted the details of this function here.
def create_profiles_from_xml(xml_members, device):
profiles = []
while len(xml_members) > 0:
parent_member = members.pop(0)
profile = create_profile(parent_member, parent=None)
if profile:
profiles.append(profile)
return profiles
And here is what I tried to do with the multipocessing
from multiprocessing import Pool
def create_profiles_from_xml(xml_members, device):
profiles = []
pool = Pool(processes=cpu_count())
profiles = pool.imap_unordered(create_profile, xml_members)
return profiles
And everytime I try to run it I get this error traceback:
Traceback (most recent call last):
File "/home/user/.virtualenvs/omnio/lib/python3.8/site-packages/django/db/backends/base/base.py", line 235, in _cursor
return self._prepare_cursor(self.create_cursor(name))
File "/home/user/.virtualenvs/omnio/lib/python3.8/site-packages/django/db/backends/postgresql/base.py", line 223, in create_cursor
cursor = self.connection.cursor()
psycopg2.InterfaceError: connection already closed
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/user/.virtualenvs/omnio/lib/python3.8/site-packages/django/core/handlers/exception.py", line 34, in inner
response = get_response(request)
File "/home/user/.virtualenvs/omnio/lib/python3.8/site-packages/axes/middleware.py", line 49, in __call__
failures_since_start = AxesProxyHandler.get_failures(request, credentials)
File "/home/user/.virtualenvs/omnio/lib/python3.8/site-packages/axes/handlers/proxy.py", line 94, in get_failures
return cls.get_implementation().get_failures(request, credentials)
File "/home/user/.virtualenvs/omnio/lib/python3.8/site-packages/axes/handlers/database.py", line 75, in get_failures
attempt_count = max(
File "/home/user/.virtualenvs/omnio/lib/python3.8/site-packages/axes/handlers/database.py", line 77, in <genexpr>
attempts.aggregate(Sum("failures_since_start"))[
File "/home/user/.virtualenvs/omnio/lib/python3.8/site-packages/django/db/models/query.py", line 379, in aggregate
return query.get_aggregation(self.db, kwargs)
File "/home/user/.virtualenvs/omnio/lib/python3.8/site-packages/django/db/models/sql/query.py", line 489, in get_aggregation
result = compiler.execute_sql(SINGLE)
File "/home/user/.virtualenvs/omnio/lib/python3.8/site-packages/django/db/models/sql/compiler.py", line 1140, in execute_sql
cursor = self.connection.cursor()
File "/home/user/.virtualenvs/omnio/lib/python3.8/site-packages/django/db/backends/base/base.py", line 256, in cursor
return self._cursor()
File "/home/user/.virtualenvs/omnio/lib/python3.8/site-packages/django/db/backends/base/base.py", line 235, in _cursor
return self._prepare_cursor(self.create_cursor(name))
File "/home/user/.virtualenvs/omnio/lib/python3.8/site-packages/django/db/utils.py", line 89, in __exit__
raise dj_exc_value.with_traceback(traceback) from exc_value
File "/home/user/.virtualenvs/omnio/lib/python3.8/site-packages/django/db/backends/base/base.py", line 235, in _cursor
return self._prepare_cursor(self.create_cursor(name))
File "/home/user/.virtualenvs/omnio/lib/python3.8/site-packages/django/db/backends/postgresql/base.py", line 223, in create_cursor
cursor = self.connection.cursor()
django.db.utils.InterfaceError: connection already closed

_gdbm.error: Database needs recovery -- after running out of storage while fetching api data

I really don't know how to help myself, being unfamiliar with this kind of error, and not finding anything on the Google landscape really. My last hope is one of you guys since I don't know where else to go with this. I tried reinstalling all libraries and setting up a new venv. For more action I don't trust myself enough in these kinds of things.
The code triggering the error:
from wetterdienst import DWDObservationData
observations_daily = DWDObservationData(
station_ids=station_ids_d,
parameter=params_daily,
time_resolution=TimeResolution.DAILY,
start_date="2015-01-01",
end_date="2020-10-10",
tidy_data=True,
humanize_column_names=True,
)
for df in observations_hourly.collect_data():
name = str(df.STATION_ID.iloc[0]).strip(".0")
df.to_csv('./data/hourly/{}.csv'.format(name))
print('{} done'.format(name))
API is found here: https://github.com/earthobservations/wetterdienst
Error:
Traceback (most recent call last):
File "/Users/sashakaun/PycharmProjects/wetter2.0/main.py", line 83, in <module>
for df in observations_hourly.collect_data():
File "/Users/sashakaun/PycharmProjects/wetter2.0/venv/lib/python3.8/site-packages/wetterdienst/dwd/observations/api.py", line 178, in collect_data
df_parameter = self._collect_parameter_from_station(
File "/Users/sashakaun/PycharmProjects/wetter2.0/venv/lib/python3.8/site-packages/wetterdienst/dwd/observations/api.py", line 243, in _collect_parameter_from_station
df_period = collect_climate_observations_data(
File "/Users/sashakaun/PycharmProjects/wetter2.0/venv/lib/python3.8/site-packages/wetterdienst/dwd/observations/access.py", line 82, in collect_climate_observations_data
filenames_and_files = download_climate_observations_data_parallel(remote_files)
File "/Users/sashakaun/PycharmProjects/wetter2.0/venv/lib/python3.8/site-packages/wetterdienst/dwd/observations/access.py", line 106, in download_climate_observations_data_parallel
return list(zip(remote_files, files_in_bytes))
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/concurrent/futures/_base.py", line 611, in result_iterator
yield fs.pop().result()
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/concurrent/futures/_base.py", line 432, in result
return self.__get_result()
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/concurrent/futures/_base.py", line 388, in __get_result
raise self._exception
File "/usr/local/Cellar/python#3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/concurrent/futures/thread.py", line 57, in run
result = self.fn(*self.args, **self.kwargs)
File "/Users/sashakaun/PycharmProjects/wetter2.0/venv/lib/python3.8/site-packages/wetterdienst/dwd/observations/access.py", line 124, in _download_climate_observations_data
return BytesIO(__download_climate_observations_data(remote_file=remote_file))
File "<decorator-gen-2>", line 2, in __download_climate_observations_data
File "/Users/sashakaun/PycharmProjects/wetter2.0/venv/lib/python3.8/site-packages/dogpile/cache/region.py", line 1356, in get_or_create_for_user_func
return self.get_or_create(
File "/Users/sashakaun/PycharmProjects/wetter2.0/venv/lib/python3.8/site-packages/dogpile/cache/region.py", line 954, in get_or_create
with Lock(
File "/Users/sashakaun/PycharmProjects/wetter2.0/venv/lib/python3.8/site-packages/dogpile/lock.py", line 185, in __enter__
return self._enter()
File "/Users/sashakaun/PycharmProjects/wetter2.0/venv/lib/python3.8/site-packages/dogpile/lock.py", line 94, in _enter
generated = self._enter_create(value, createdtime)
File "/Users/sashakaun/PycharmProjects/wetter2.0/venv/lib/python3.8/site-packages/dogpile/lock.py", line 178, in _enter_create
return self.creator()
File "/Users/sashakaun/PycharmProjects/wetter2.0/venv/lib/python3.8/site-packages/dogpile/cache/region.py", line 920, in gen_value
self.backend.set(key, value)
File "/Users/sashakaun/PycharmProjects/wetter2.0/venv/lib/python3.8/site-packages/dogpile/cache/backends/file.py", line 239, in set
dbm[key] = pickle.dumps(value, pickle.HIGHEST_PROTOCOL)
_gdbm.error: Database needs recovery
Thanks a lot!!
A GDBM file has been corrupted. You need to use gdbmtool to recover the database. Install gdbmtool then run
gdbmtool FILENAME
Where FILENAME is the name of the GDBM database. A prompt will appear, then you can enter
gdbmtool> recover summary
If the database can be recovered it will display a summary of the recovery results, eg:
Recovery succeeded.
Keys recovered: 6870650, failed: 5, duplicate: 0
Buckets recovered: 64830, failed: 2

msrest.exceptions.ValidationError: Parameter 'resource_group_name' must conform to the following pattern: '^[-\\w\\._\\(\\)]+$'

sathish#Azure:~/quickstart/python-docs-hello-world$ az webapp up -n appname
The command failed with an unexpected error. Here is the traceback:
Parameter 'resource_group_name' must conform to the following pattern: '^[-\\w\\._\\(\\)]+$'.
Traceback (most recent call last):
File "/opt/az/lib/python3.6/site-packages/knack/cli.py", line 206, in invoke
cmd_result = self.invocation.execute(args)
File "/opt/az/lib/python3.6/site-packages/azure/cli/core/commands/__init__.py", line 326, in execute
raise ex
File "/opt/az/lib/python3.6/site-packages/azure/cli/core/commands/__init__.py", line 384, in _run_jobs_serially
results.append(self._run_job(expanded_arg, cmd_copy))
File "/opt/az/lib/python3.6/site-packages/azure/cli/core/commands/__init__.py", line 375, in _run_job
cmd_copy.exception_handler(ex)
File "/opt/az/lib/python3.6/site-packages/azure/cli/command_modules/appservice/commands.py", line 54, in _polish_bad_errors
raise ex
File "/opt/az/lib/python3.6/site-packages/azure/cli/core/commands/__init__.py", line 354, in _run_job
result = cmd_copy(params)
File "/opt/az/lib/python3.6/site-packages/azure/cli/core/commands/__init__.py", line 145, in __call__
return self.handler(*args, **kwargs)
File "/opt/az/lib/python3.6/site-packages/azure/cli/core/__init__.py", line 451, in default_command_handler
return op(**command_args)
File "/opt/az/lib/python3.6/site-packages/azure/cli/command_modules/appservice/custom.py", line 2313, in webapp_up
_create_new_rg = should_create_new_rg(cmd, default_rg, rg_name, is_linux)
File "/opt/az/lib/python3.6/site-packages/azure/cli/command_modules/appservice/_create_util.py", line 282, in should_create_new_rg
elif (_check_resource_group_exists(cmd, rg_name) and
File "/opt/az/lib/python3.6/site-packages/azure/cli/command_modules/appservice/_create_util.py", line 86, in _check_resource_group_exists
return rcf.resource_groups.check_existence(rg_name)
File "/opt/az/lib/python3.6/site-packages/azure/mgmt/resource/resources/v2018_05_01/operations/resource_groups_operations.py", line 61, in check_existence
'resourceGroupName': self._serialize.url("resource_group_name", resource_group_name, 'str', max_length=90, min_length=1, pattern=r'^[-\w\._\(\)]+$'),
File "/opt/az/lib/python3.6/site-packages/msrest/serialization.py", line 592, in url
data = self.validate(data, name, required=True, **kwargs)
File "/opt/az/lib/python3.6/site-packages/msrest/serialization.py", line 672, in validate
raise ValidationError(key, name, value)
msrest.exceptions.ValidationError: Parameter 'resource_group_name' must conform to the following pattern: '^[-\\w\\._\\(\\)]+$'.
To open an issue, please run: 'az feedback'
I had this problem, but resolved it by removing the quotes around the resource group name. Silly mistake on my behalf, but may be the issue here.
e.g. use az group create -n my-groupname-rg -l northeurope
rather than: e.g. use az group create -n 'my-groupname-rg' -l 'northeurope'
For resource group name, you should take a look at Naming rules and restrictions:

I wrote a project with tornado, but this exception is always in my log file

This is the error log:
[I 160308 11:09:59 web:1908] 200 GET /admin/realtime (117.93.180.216) 107.13ms
[E 160308 11:09:59 http1connection:54] Uncaught exception
Traceback (most recent call last):
File "/usr/local/lib/python3.4/dist-packages/tornado/http1connection.py", line 238, in _read_message
delegate.finish()
File "/usr/local/lib/python3.4/dist-packages/tornado/httpserver.py", line 290, in finish
self.delegate.finish()
File "/usr/local/lib/python3.4/dist-packages/tornado/web.py", line 1984, in finish
self.execute()
File "/usr/local/lib/python3.4/dist-packages/blueware-1.0.10/blueware/hooks/framework_tornado/web.py", line 480, in _bw_wrapper__RequestDispatcher_execute
future = wrapped(*args, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/tornado/web.py", line 2004, in execute
**self.handler_kwargs)
File "/usr/local/lib/python3.4/dist-packages/blueware-1.0.10/blueware/hooks/framework_tornado/web.py", line 448, in _bw_wrapper_RequestHandler___init___
return wrapped(*args, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/tornado/web.py", line 185, in init
self.initialize(**kwargs)
File "/usr/local/lib/python3.4/dist-packages/tornado/web.py", line 2714, in wrapper
self.redirect(url)
File "/usr/local/lib/python3.4/dist-packages/tornado/web.py", line 671, in redirect
self.finish()
File "/usr/local/lib/python3.4/dist-packages/blueware-1.0.10/blueware/hooks/framework_tornado/web.py", line 309, in _bw_wrapper_RequestHandler_finish_
return wrapped(*args, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/tornado/web.py", line 934, in finish
self.flush(include_footers=True)
File "/usr/local/lib/python3.4/dist-packages/tornado/web.py", line 870, in flush
for transform in self._transforms:
TypeError: 'NoneType' object is not iterable
[I 160308 11:10:00 web:1908] 200 GET /admin/order?order_type=1&order_status=1&page=0&action=allreal (49.89.27.173) 134.53ms
Can anyone tell me how to solve this problem? Thank you very much
I assume that OneAPM (blueware agent) is compatible with your python and Tornado version, however it's can be tricky.
Solution
Move self.redirect(url) from your handler initialize method to get method, like this
class MyHandler(tornado.web.RequestHandler):
def get(self):
self.redirect('/some_url')
or use RedirectHandler.
Every action that could finish request needs to be called in context of http-verb method (get, post, put and so on). The common mistake is making authetication/authorization in __init__ or initialize.
More detail
In Tornado's source there is a note about _transforms that is initialized in the constructor with None and set in_execute (oversimplifying - after headers_received).
A transform modifies the result of an HTTP request (e.g., GZip encoding).
Applications are not expected to create their own OutputTransforms
or interact with them directly; the framework chooses which transforms
(if any) to apply.
Reproduce
Sample that triggers this error. I'm including this only as a cross-check that blueware is not the cause:
import tornado.ioloop
import tornado.web
class SomeHandler(tornado.web.RequestHandler):
def initialize(self, *args, **kwargs):
url = '/some'
self.redirect(url)
# ^ this is wrong
def get(self):
# redirect should be here
self.write("Hello")
def make_app():
return tornado.web.Application([
(r"/", SomeHandler),
])
if __name__ == "__main__":
app = make_app()
app.listen(8888)
tornado.ioloop.IOLoop.current().start()
And stacktrace:
ERROR:tornado.application:Uncaught exception
Traceback (most recent call last):
File "/tmp/py3/lib/python3.4/site-packages/tornado/http1connection.py", line 238, in _read_message
delegate.finish()
File "/tmp/py3/lib/python3.4/site-packages/tornado/httpserver.py", line 289, in finish
self.delegate.finish()
File "/tmp/py3/lib/python3.4/site-packages/tornado/web.py", line 2022, in finish
self.execute()
File "/tmp/py3/lib/python3.4/site-packages/tornado/web.py", line 2042, in execute
**self.handler_kwargs)
File "/tmp/py3/lib/python3.4/site-packages/tornado/web.py", line 183, in __init__
self.initialize(**kwargs)
File "test.py", line 8, in initialize
self.redirect(url)
File "/tmp/py3/lib/python3.4/site-packages/tornado/web.py", line 666, in redirect
self.finish()
File "/tmp/py3/lib/python3.4/site-packages/tornado/web.py", line 932, in finish
self.flush(include_footers=True)
File "/tmp/py3/lib/python3.4/site-packages/tornado/web.py", line 868, in flush
for transform in self._transforms:
TypeError: 'NoneType' object is not iterable

Time out error while creating cgi.FieldStorage object

Hey, any idea about what is the timeout error which I am getting here:
Error trace:
File "/array/purato/python2.6/lib/python2.6/site-packages/cherrypy/_cprequest.py", line 606, in respond
cherrypy.response.body = self.handler()
File "/array/purato/python2.6/lib/python2.6/site-packages/cherrypy/_cpdispatch.py", line 25, in __call__
return self.callable(*self.args, **self.kwargs)
File "sync_server.py", line 853, in put_file
return RequestController_v1_0.put_file(self, *args, **kw)
File "sync_server.py", line 409, in put_file
saved_path, tgt_path, root_folder = self._save_file(client_id, theFile)
File "sync_server.py", line 404, in _save_file
saved_path, tgt_path, root_folder = get_posted_file(cherrypy.request, 'theFile', staging_path)
File "sync_server.py", line 1031, in get_posted_file
, keep_blank_values=True)
File "/array/purato/python2.6/lib/python2.6/cgi.py", line 496, in __init__
self.read_multi(environ, keep_blank_values, strict_parsing)
File "/array/purato/python2.6/lib/python2.6/cgi.py", line 620, in read_multi
environ, keep_blank_values, strict_parsing)
File "/array/purato/python2.6/lib/python2.6/cgi.py", line 498, in __init__
self.read_single()
File "/array/purato/python2.6/lib/python2.6/cgi.py", line 635, in read_single
self.read_lines()
File "/array/purato/python2.6/lib/python2.6/cgi.py", line 657, in read_lines
self.read_lines_to_outerboundary()
File "/array/purato/python2.6/lib/python2.6/cgi.py", line 685, in read_lines_to_outerboundary
line = self.fp.readline(1<<16)
File "/array/purato/python2.6/lib/python2.6/site-packages/cherrypy/wsgiserver/__init__.py", line 206, in readline
data = self.rfile.readline(size)
File "/array/purato/python2.6/lib/python2.6/site-packages/cherrypy/wsgiserver/__init__.py", line 868, in readline
data = self.recv(self._rbufsize)
File "/array/purato/python2.6/lib/python2.6/site-packages/cherrypy/wsgiserver/__init__.py", line 747, in recv
return self._sock.recv(size)
timeout: timed out
Here is the code which is getting called:
def get_posted_file(request, form_field_name, tgt_folder, tgt_fname=None):
logger.debug('get_posted_file: %s' % request.headers['Last-Modified'])
lowerHeaderMap = {}
for key, value in request.headers.items():
lowerHeaderMap[key.lower()] = value
---> dataDict = TmpFieldStorage(fp=request.rfile, headers=lowerHeaderMap, environ={'REQUEST_METHOD':'POST'}
, keep_blank_values=True)
and:
class TmpFieldStorage(cgi.FieldStorage):
"""
Use a named temporary file to allow creation of hard link to final destination
"""
def make_file(self, binary=None):
tmp_folder = os.path.join(get_filer_root(cherrypy.request.login), 'sync_tmp')
if not os.path.exists(tmp_folder):
os.makedirs(tmp_folder)
return tempfile.NamedTemporaryFile(dir=tmp_folder)
environ={'REQUEST_METHOD':'POST'}
That seems a rather deficient environ. The CGI spec requires many more environment variables to be in there, some of which the cgi module is going to need.
In particular there is no CONTENT_LENGTH header. Without it, cgi is defaulting to reading the entire contents of the stream up until EOF. But since it is (probably) a network stream rather than a file there will be no EOF (or at least not one directly at the end of the submission), so the form reader will be sitting there waiting for more input that will never come. Timeout.

Categories