Problem to use image to string on tesseract

Problem to use image to string on tesseract - python

Hello guys, I have some simple Python code that uses Tesseract, and I think the problem is related to the version or something like that. Take a look at the code:
from PIL import Image
import pytesseract
file = '/home/gxs/Downloads/a.png'
img = Image.open(file)
text = pytesseract.image_to_string(Image.open(file))
For this, I get the following output (error):
TesseractError Traceback (most recent call last)
<ipython-input-1-65b8cbea5fe0> in <module>
4 img = Image.open(file)
5 #display(img)
----> 6 text = pytesseract.image_to_string(Image.open(file))
~/.local/lib/python3.8/site-packages/pytesseract/pytesseract.py in image_to_string(image, lang, config, nice, output_type, timeout)
368 args = [image, 'txt', lang, config, nice, timeout]
369
--> 370 return {
371 Output.BYTES: lambda: run_and_get_output(*(args + [True])),
372 Output.DICT: lambda: {'text': run_and_get_output(*args)},
~/.local/lib/python3.8/site-packages/pytesseract/pytesseract.py in <lambda>()
371 Output.BYTES: lambda: run_and_get_output(*(args + [True])),
372 Output.DICT: lambda: {'text': run_and_get_output(*args)},
--> 373 Output.STRING: lambda: run_and_get_output(*args),
374 }[output_type]()
375
~/.local/lib/python3.8/site-packages/pytesseract/pytesseract.py in run_and_get_output(image, extension, lang, config, nice, timeout, return_bytes)
280 }
281
--> 282 run_tesseract(**kwargs)
283 filename = kwargs['output_filename_base'] + extsep + extension
284 with open(filename, 'rb') as output_file:
~/.local/lib/python3.8/site-packages/pytesseract/pytesseract.py in run_tesseract(input_filename, output_filename_base, extension, lang, config, nice, timeout)
256 with timeout_manager(proc, timeout) as error_string:
257 if proc.returncode:
--> 258 raise TesseractError(proc.returncode, get_errors(error_string))
259
260
TesseractError: (-11, 'Tesseract Open Source OCR Engine v3.03 with Leptonica actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES:Error:Assert failed:in file tessdatamanager.cpp, line 53')
PS: My distro is Arch Linux. I have tried uninstalling it along with all of its dependencies, but that did not help at all.

Related

Read Excel file from OneDrive / Sharepoint - error code: File is not a zip file

I am trying to read an Excel file from SharePoint into Python. My first error message said that I should specify an engine manually, so I set engine = 'openpyxl', and now the following error message comes up: File is not a zip file
Previous Q&As often discussed whether the Excel file is a real Excel file or just a text file with a fake .xlsx extension.
The Excel file was created using Microsoft Excel and it's stored in a shared OneDrive folder (Team - SharePoint). Does that affect the error message?
How can I solve this?
Many thanks in advance!
My Code:
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext
from office365.sharepoint.files.file import File
url_sp = 'https://company.sharepoint.com/teams/TeamE'
username_sp = 'MyUsername'
password_sp = 'MyPassword'
folder_url_sp = '/Shared%20Documents/02%20Team%20IAP/06_Da-An/Data/E/Edate.xlsx'
#Authentication
ctx_auth = AuthenticationContext(url_sp)
if ctx_auth.acquire_token_for_user(username_sp, password_sp):
ctx = ClientContext(url_sp, ctx_auth)
web = ctx.web
ctx.load(web)
ctx.execute_query()
print('Authentication sucessfull')
else:
print(ctx_auth.get_last_error())
import io
response = File.open_binary(ctx,folder_url_sp)
bytes_file_obj = io.BytesIO()
bytes_file_obj.write(response.content)
bytes_file_obj.seek(0)
data = pd.read_excel(bytes_file_obj,sheet_name = None, engine = 'openpyxl')
The Error:
BadZipFile Traceback (most recent call last)
Cell In[29], line 32
29 bytes_file_obj.write(response.content)
30 bytes_file_obj.seek(0)
---> 32 data = pd.read_excel(bytes_file_obj, sheet_name= None, engine = 'openpyxl')
File ~\Anaconda3\lib\site-packages\pandas\util\_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
305 if len(args) > num_allow_args:
306 warnings.warn(
307 msg.format(arguments=arguments),
308 FutureWarning,
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
File ~\Anaconda3\lib\site-packages\pandas\io\excel\_base.py:457, in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, decimal, comment, skipfooter, convert_float, mangle_dupe_cols, storage_options)
455 if not isinstance(io, ExcelFile):
456 should_close = True
--> 457 io = ExcelFile(io, storage_options=storage_options, engine=engine)
458 elif engine and engine != io.engine:
459 raise ValueError(
460 "Engine should not be specified when passing "
461 "an ExcelFile - ExcelFile already has the engine set"
462 )
File ~\Anaconda3\lib\site-packages\pandas\io\excel\_base.py:1419, in ExcelFile.__init__(self, path_or_buffer, engine, storage_options)
1416 self.engine = engine
1417 self.storage_options = storage_options
-> 1419 self._reader = self._engines[engine](self._io, storage_options=storage_options)
File ~\Anaconda3\lib\site-packages\pandas\io\excel\_openpyxl.py:525, in OpenpyxlReader.__init__(self, filepath_or_buffer, storage_options)
514 """
515 Reader using openpyxl engine.
516
(...)
522 passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``)
523 """
524 import_optional_dependency("openpyxl")
--> 525 super().__init__(filepath_or_buffer, storage_options=storage_options)
File ~\Anaconda3\lib\site-packages\pandas\io\excel\_base.py:518, in BaseExcelReader.__init__(self, filepath_or_buffer, storage_options)
516 self.handles.handle.seek(0)
517 try:
--> 518 self.book = self.load_workbook(self.handles.handle)
519 except Exception:
520 self.close()
File ~\Anaconda3\lib\site-packages\pandas\io\excel\_openpyxl.py:536, in OpenpyxlReader.load_workbook(self, filepath_or_buffer)
533 def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
534 from openpyxl import load_workbook
--> 536 return load_workbook(
537 filepath_or_buffer, read_only=True, data_only=True, keep_links=False
538 )
File ~\Anaconda3\lib\site-packages\openpyxl\reader\excel.py:315, in load_workbook(filename, read_only, keep_vba, data_only, keep_links)
288 def load_workbook(filename, read_only=False, keep_vba=KEEP_VBA,
289 data_only=False, keep_links=True):
290 """Open the given filename and return the workbook
291
292 :param filename: the path to open or a file-like object
(...)
313
314 """
--> 315 reader = ExcelReader(filename, read_only, keep_vba,
316 data_only, keep_links)
317 reader.read()
318 return reader.wb
File ~\Anaconda3\lib\site-packages\openpyxl\reader\excel.py:124, in ExcelReader.__init__(self, fn, read_only, keep_vba, data_only, keep_links)
122 def __init__(self, fn, read_only=False, keep_vba=KEEP_VBA,
123 data_only=False, keep_links=True):
--> 124 self.archive = _validate_archive(fn)
125 self.valid_files = self.archive.namelist()
126 self.read_only = read_only
File ~\Anaconda3\lib\site-packages\openpyxl\reader\excel.py:96, in _validate_archive(filename)
89 msg = ('openpyxl does not support %s file format, '
90 'please check you can open '
91 'it with Excel first. '
92 'Supported formats are: %s') % (file_format,
93 ','.join(SUPPORTED_FORMATS))
94 raise InvalidFileException(msg)
---> 96 archive = ZipFile(filename, 'r')
97 return archive
File ~\Anaconda3\lib\zipfile.py:1266, in ZipFile.__init__(self, file, mode, compression, allowZip64, compresslevel, strict_timestamps)
1264 try:
1265 if mode == 'r':
-> 1266 self._RealGetContents()
1267 elif mode in ('w', 'x'):
1268 # set the modified flag so central directory gets written
1269 # even if no files are added to the archive
1270 self._didModify = True
File ~\Anaconda3\lib\zipfile.py:1333, in ZipFile._RealGetContents(self)
1331 raise BadZipFile("File is not a zip file")
1332 if not endrec:
-> 1333 raise BadZipFile("File is not a zip file")
1334 if self.debug > 1:
1335 print(endrec)
BadZipFile: File is not a zip file
```

ValueError: Invalid file path or buffer object type

I've been using mplsoccer library and statsbombpy libraries for a while now with success.
Recently, I've tried to use it again with this code (not fully reproducible due to it being behind a paid api).
!pip install mplsoccer
import pandas as pd
import requests
from mplsoccer.statsbomb import read_event
username = creds['user']
password = creds['passwd']
auth = requests.auth.HTTPBasicAuth(username, password)
URL = 'https://data.statsbombservices.com/api/v5/events/18241'
response = requests.get(URL, auth = auth)
df_dict = read_event(response)
and I'm now starting to get the ValueError of invalid file path or buffer type. I contacted the owner of mplsoccer and asked him about it, and he said it wasn't a reproducible error for him, but it looks like my pandas is having trouble reading it.
response is returning exactly what it should be, it just fails with the below error code
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-64-9be43ec4f519> in <module>
6 URL = 'https://data.statsbombservices.com/api/v5/events/7430'
7 response = requests.get(URL, auth=auth)
----> 8 df_dict = read_event(response)
E:\py\lib\site-packages\mplsoccer\statsbomb.py in read_event(path_or_buf, related_event_df, shot_freeze_frame_df, tactics_lineup_df, warn)
120 match_id = int(path_or_buf.url.split('/')[-1].split('.')[0])
121 else:
--> 122 df = pd.read_json(path_or_buf, encoding='utf-8')
123 match_id = int(os.path.basename(path_or_buf)[:-5])
124
E:\py\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
205 else:
206 kwargs[new_arg_name] = new_arg_value
--> 207 return func(*args, **kwargs)
208
209 return cast(F, wrapper)
E:\py\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
E:\py\lib\site-packages\pandas\io\json\_json.py in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, encoding_errors, lines, chunksize, compression, nrows, storage_options)
588 convert_axes = True
589
--> 590 json_reader = JsonReader(
591 path_or_buf,
592 orient=orient,
E:\py\lib\site-packages\pandas\io\json\_json.py in __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, lines, chunksize, compression, nrows, storage_options, encoding_errors)
673 raise ValueError("nrows can only be passed if lines=True")
674
--> 675 data = self._get_data_from_filepath(filepath_or_buffer)
676 self.data = self._preprocess_data(data)
677
E:\py\lib\site-packages\pandas\io\json\_json.py in _get_data_from_filepath(self, filepath_or_buffer)
710 or file_exists(filepath_or_buffer)
711 ):
--> 712 self.handles = get_handle(
713 filepath_or_buffer,
714 "r",
E:\py\lib\site-packages\pandas\io\common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
606
607 # open URLs
--> 608 ioargs = _get_filepath_or_buffer(
609 path_or_buf,
610 encoding=encoding,
E:\py\lib\site-packages\pandas\io\common.py in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
393 if not is_file_like(filepath_or_buffer):
394 msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}"
--> 395 raise ValueError(msg)
396
397 return IOArgs(
ValueError: Invalid file path or buffer object type: <class 'requests_cache.models.response.CachedResponse'>
hoping someone can help me see exactly where pandas is struggling and what I can do to fix it? Thanks

OSError while calling Detectron2LayoutModel

After successfully installing Layout Parser on Windows, I am getting the OSError below.
Code Used:
model = lp.Detectron2LayoutModel(config_path="lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"})
Using layout parser, trying to extract the content from image. But when I try to load models in Layout parser, it fails with the below error
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_178664\3254664512.py in <module>
1 model = lp.Detectron2LayoutModel(config_path="lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
2 extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
----> 3 label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"})
4 # Load the deep layout model from the layoutparser API
5 # For all the supported model, please check the Model
~\Anaconda3\envs\layout\lib\site-packages\layoutparser\models\detectron2\layoutmodel.py in __init__(self, config_path, model_path, label_map, extra_config, enforce_cpu, device)
89 config_path, model_path, allow_empty_path=True
90 )
---> 91 config_path = PathManager.get_local_path(config_path)
92
93 if label_map is None:
~\Anaconda3\envs\layout\lib\site-packages\iopath\common\file_io.py in get_local_path(self, path, force, **kwargs)
1195 handler = self.__get_path_handler(path) # type: ignore
1196 try:
-> 1197 bret = handler._get_local_path(path, force=force, **kwargs)
1198 except TypeError:
1199 bret = handler._get_local_path(path, **kwargs)
~\Anaconda3\envs\layout\lib\site-packages\layoutparser\models\detectron2\catalog.py in _get_local_path(self, path, **kwargs)
134 else:
135 raise ValueError(f"Unknown data_type {data_type}")
--> 136 return PathManager.get_local_path(model_url, **kwargs)
137
138 def _open(self, path, mode="r", **kwargs):
~\Anaconda3\envs\layout\lib\site-packages\iopath\common\file_io.py in get_local_path(self, path, force, **kwargs)
1195 handler = self.__get_path_handler(path) # type: ignore
1196 try:
-> 1197 bret = handler._get_local_path(path, force=force, **kwargs)
1198 except TypeError:
1199 bret = handler._get_local_path(path, **kwargs)
~\Anaconda3\envs\layout\lib\site-packages\iopath\common\file_io.py in _get_local_path(self, path, force, cache_dir, **kwargs)
792
793 cached = os.path.join(dirname, filename)
--> 794 with file_lock(cached):
795 if not os.path.isfile(cached):
796 logger.info("Downloading {} ...".format(path))
~\Anaconda3\envs\layout\lib\site-packages\portalocker\utils.py in __enter__(self)
155
156 def __enter__(self):
--> 157 return self.acquire()
158
159 def __exit__(self,
~\Anaconda3\envs\layout\lib\site-packages\portalocker\utils.py in acquire(self, timeout, check_interval, fail_when_locked)
237
238 # Get a new filehandler
--> 239 fh = self._get_fh()
240
241 def try_close(): # pragma: no cover
~\Anaconda3\envs\layout\lib\site-packages\portalocker\utils.py in _get_fh(self)
287 def _get_fh(self) -> typing.IO:
288 '''Get a new filehandle'''
--> 289 return open(self.filename, self.mode, **self.file_open_kwargs)
290
291 def _get_lock(self, fh: typing.IO) -> typing.IO:
OSError: [Errno 22] Invalid argument: 'C:\\Users\\vchinna/.torch/iopath_cache\\s/nau5ut6zgthunil\\config.yaml?dl=1.lock'
Not sure whether it is a kind of lock or something.
Please help

I got a similar error as well. I manually tried out a workaround on Windows.
I am using your case as an example: OSError: [Errno 22] Invalid argument: 'C:\Users\vchinna/.torch/iopath_cache\s/nau5ut6zgthunil\config.yaml?dl=1.lock'
Please follow the steps below.
Navigate to C:\Users\vchinna/.torch/iopath_cache\s/nau5ut6zgthunil\config.yaml
Open that config.yaml file
Scroll down to the line WEIGHTS: https://www.dropbox.com/s/h7th27jfv19rxiy/model_final.pth?dl=1 (it should be around line 265).
Copy that link and paste it in your browser, a 'model_final.pth' will be downloaded. Copy this file to your desired folder.
Now replace the path to WEIGHTS: your_desired_folder/model_final.pth
Save it and run the code; it should now work!
There is also a small workaround that I think you need to apply before doing this (if you have not done so already).
It is the iopath workaround described in this GitHub issue:
https://github.com/Layout-Parser/layout-parser/issues/15

Why can't Python/Jupyter notebook find the text file? Where should it be saved?

I'm trying to load a text file as an array in python by entering this code:
from numpy import loadtxt
values = loadtxt("values.txt", float)
mean = sum(values)/len(values)
print(mean)
but when I run the program I get:
OSError Traceback (most recent call last)
<ipython-input-10-4b9a39f8b17f> in <module>
1 from numpy import loadtxt
----> 2 values = loadtxt("values.txt", float)
3 mean = sum(values)/len(values)
4 print(mean)
~\Anaconda3\lib\site-packages\numpy\lib\npyio.py in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin, encoding, max_rows)
960 fname = os_fspath(fname)
961 if _is_string_like(fname):
--> 962 fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
963 fencoding = getattr(fh, 'encoding', 'latin1')
964 fh = iter(fh)
~\Anaconda3\lib\site-packages\numpy\lib\_datasource.py in open(path, mode, destpath, encoding, newline)
264
265 ds = DataSource(destpath)
--> 266 return ds.open(path, mode, encoding=encoding, newline=newline)
267
268
~\Anaconda3\lib\site-packages\numpy\lib\_datasource.py in open(self, path, mode, encoding, newline)
622 encoding=encoding, newline=newline)
623 else:
--> 624 raise IOError("%s not found." % path)
625
626
OSError: values.txt not found.
I have the values.txt file saved in my documents folder. Do I need to save it in some specific folder so Python can find it?

You can either use the absolute path to the file, or keep loadtxt("values.txt", float), but then the file must be in the same folder as your script/Jupyter notebook.

Python -- How to rename Musescore path in package music21?

I tried to install the Python package music21 and am having a problem running it in Windows. Basically, when I tried to run the simple command they give as an example
converter.parse("tinynotation: 3/4 c4 d8 f g16 a g f#").show()
I got an error
SubConverterException: Cannot find a path to the 'mscore' file at C:\Program Files (x86)\MuseScore 2\MuseScore.exe -- download MuseScore
The reason for this is that MuseScore.exe is no longer stored in the folder "MuseScore 2" but in a subfolder called "bin". So the path needs to be set to "C:\Program Files (x86)\MuseScore 2\bin\MuseScore.exe" in order to access MuseScore.
How do I change this?
Full Error
SubConverterException Traceback (most recent call last)
<ipython-input-8-46c66c71749d> in <module>()
----> 1 converter.parse("tinynotation: 3/4 c4 d8 f g16 a g f#").show()
C:\Users\MrNoName\Anaconda3\lib\site-packages\music21\stream\__init__.py in show(self, *args, **kwargs)
255 if self.isSorted is False and self.autoSort:
256 self.sort()
--> 257 return super(Stream, self).show(*args, **kwargs)
258
259 #---------------------------------------------------------------------------
C:\Users\MrNoName\Anaconda3\lib\site-packages\music21\base.py in show(self, fmt, app, **keywords)
2586 app=app,
2587 subformats=subformats,
-> 2588 **keywords)
2589
2590 #--------------------------------------------------------------------------
C:\Users\MrNoName\Anaconda3\lib\site-packages\music21\converter\subConverters.py in show(self, obj, fmt, app, subformats, **keywords)
312
313 if 'Opus' not in obj.classes:
--> 314 fp = helperSubConverter.write(obj, helperFormat, subformats=helperSubformats)
315
316 defaults.title = savedDefaultTitle
C:\Users\MrNoName\Anaconda3\lib\site-packages\music21\converter\subConverters.py in write(self, obj, fmt, fp, subformats, **keywords)
808
809 if subformats is not None and 'png' in subformats:
--> 810 fp = self.runThroughMusescore(fp, **keywords)
811 return fp
812
C:\Users\MrNoName\Anaconda3\lib\site-packages\music21\converter\subConverters.py in runThroughMusescore(self, fp, **keywords)
756 raise SubConverterException(
757 "Cannot find a path to the 'mscore' file at " +
--> 758 "%s -- download MuseScore" % musescorePath)
759
760 fpOut = fp[0:len(fp) - 3]
SubConverterException: Cannot find a path to the 'mscore' file at C:\Program Files (x86)\MuseScore 2\MuseScore.exe -- download MuseScore

Do this right after importing music21:
environment.set('musescoreDirectPNGPath', 'C:\\Program Files (x86)\\MuseScore 2\\bin\\MuseScore.exe')

For MuseScore 3
us = environment.UserSettings()
us['musicxmlPath'] = 'C:\\Program Files\\MuseScore 3\\bin\\MuseScore3.exe'
us['musescoreDirectPNGPath'] = 'C:\\Program Files\\MuseScore 3\\bin\\MuseScore3.exe'
us['musicxmlPath']
And if it still does not work, try opening environment.py in a text editor (Sublime Text or any other); it is located at
C:\Users\YOU\AppData\Local\Programs\Python\Python39\Lib\site-packages\music21\environment.py
then change
'%PROGRAMFILES%\MuseScore 3\MuseScore.exe'
for
'%PROGRAMFILES%\MuseScore 3\bin\MuseScore.exe'

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Problem to use image to string on tesseract - python

Related

Read Excel file from OneDrive / Sharepoint - error code: File is not a zip file

ValueError: Invalid file path or buffer object type

OSError while calling Detectron2LayoutModel

Why can't Python/Jupyter notebook find the text file? Where should it be saved?

Python -- How to rename Musescore path in package music21?

Categories

Resources