Read shapefile from HDFS with geopandas - python

I have a shapefile on HDFS and I would like to import it into my Jupyter Notebook with geopandas (version 0.8.1).
I tried the standard read_file() method, but it does not recognize the HDFS path; it appears to search my local directory instead, since the same call with a local path reads the shapefile correctly.
This is the code I used:
import geopandas as gpd
shp = gpd.read_file('hdfs://hdfsha/my_hdfs_directory/my_shapefile.shp')
and the error I obtained:
---------------------------------------------------------------------------
CPLE_OpenFailedError Traceback (most recent call last)
fiona/_shim.pyx in fiona._shim.gdal_open_vector()
fiona/_err.pyx in fiona._err.exc_wrap_pointer()
CPLE_OpenFailedError: hdfs://hdfsha/my_hdfs_directory/my_shapefile.shp: No such file or directory
During handling of the above exception, another exception occurred:
DriverError Traceback (most recent call last)
<ipython-input-17-3118e740e4a9> in <module>
----> 2 shp = gpd.read_file('hdfs://hdfsha/my_hdfs_directory/my_shapefile.shp')
3 print(shp.shape)
4 shp.head(3)
/opt/venv/geocoding/lib/python3.6/site-packages/geopandas/io/file.py in _read_file(filename, bbox, mask, rows, **kwargs)
94
95 with fiona_env():
---> 96 with reader(path_or_bytes, **kwargs) as features:
97
98 # In a future Fiona release the crs attribute of features will
/opt/venv/geocoding/lib/python3.6/site-packages/fiona/env.py in wrapper(*args, **kwargs)
398 def wrapper(*args, **kwargs):
399 if local._env:
--> 400 return f(*args, **kwargs)
401 else:
402 if isinstance(args[0], str):
/opt/venv/geocoding/lib/python3.6/site-packages/fiona/__init__.py in open(fp, mode, driver, schema, crs, encoding, layer, vfs, enabled_drivers, crs_wkt, **kwargs)
255 if mode in ('a', 'r'):
256 c = Collection(path, mode, driver=driver, encoding=encoding,
--> 257 layer=layer, enabled_drivers=enabled_drivers, **kwargs)
258 elif mode == 'w':
259 if schema:
/opt/venv/geocoding/lib/python3.6/site-packages/fiona/collection.py in __init__(self, path, mode, driver, schema, crs, encoding, layer, vsi, archive, enabled_drivers, crs_wkt, ignore_fields, ignore_geometry, **kwargs)
160 if self.mode == 'r':
161 self.session = Session()
--> 162 self.session.start(self, **kwargs)
163 elif self.mode in ('a', 'w'):
164 self.session = WritingSession()
fiona/ogrext.pyx in fiona.ogrext.Session.start()
fiona/_shim.pyx in fiona._shim.gdal_open_vector()
DriverError: hdfs://hdfsha/my_hdfs_directory/my_shapefile.shp: No such file or directory
So I was wondering whether it is actually possible to read a shapefile stored in HDFS with geopandas, and if so, how?

If someone is still looking for an answer to this question, I managed to find a workaround.
First of all, you need a .zip file containing all the files that make up your shapefile (.shp, .shx, .dbf, ...). Then we use pyarrow to establish a connection to HDFS and fiona to read the zipped shapefile.
Package versions I'm using:
pyarrow==2.0.0
fiona==1.8.18
The code:
# import packages
import pandas as pd
import geopandas as gpd
import fiona
import pyarrow
# establish a connection to HDFS
fs = pyarrow.hdfs.connect()
# read zipped shapefile
with fiona.io.ZipMemoryFile(fs.open('hdfs://my_hdfs_directory/my_zipped_shapefile.zip')) as z:
    with z.open('my_shp_file_within_zip.shp') as collection:
        gdf = gpd.GeoDataFrame.from_features(collection)
        print(gdf.shape)
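Side note: pyarrow.hdfs.connect() is deprecated in newer pyarrow releases in favor of pyarrow.fs.HadoopFileSystem. Below is a minimal sketch of the same workaround against the newer API; 'default' resolves the namenode from core-site.xml, and the paths are placeholders for your cluster:
import geopandas as gpd
import fiona
from pyarrow import fs

# connect to HDFS via the newer filesystem API
hdfs = fs.HadoopFileSystem('default')

# read the whole zip into memory and hand the bytes to fiona
with hdfs.open_input_stream('/my_hdfs_directory/my_zipped_shapefile.zip') as f:
    data = f.read()
with fiona.io.ZipMemoryFile(data) as z:
    with z.open('my_shp_file_within_zip.shp') as collection:
        gdf = gpd.GeoDataFrame.from_features(collection)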

Related

Geopandas pickle incompatibility between 2 versions

Package versions when I dump the GeoDataFrame object with pickle:
scikit-image==0.17.2
scikit-learn==0.22.1
geopandas==0.7.0
pandas==1.0.3
Dump the GeoDataFrame with pickle:
import pandas as pd
import pickle
pickle_file_path = "geo_data_frame.pkl" # file is attached
gdf.to_pickle(pickle_file_path) # gdf is geo data frame obj for example like gdf = gpd.GeoDataFrame(df, crs="epsg:4326")
Package versions when I read the pickled file:
scikit-image==0.19.2
scikit-learn==1.0.2
geopandas==0.10.2
pandas==1.0.3
Read the pickle:
import pandas as pd
import pickle
pickle_file_path = "geo_data_frame.pkl" # file is attached
gdf = pd.read_pickle(pickle_file_path)
Traceback:
---> 58 gdf = pd.read_pickle(pickle_file_path)
59 shutil.rmtree(tempdir)
60 return pickle.dumps(gdf)
/opt/conda/envs/env/lib/python3.8/site-packages/pandas/io/pickle.py in read_pickle(filepath_or_buffer, compression)
180 # We want to silence any warnings about, e.g. moved modules.
181 warnings.simplefilter("ignore", Warning)
--> 182 return pickle.load(f)
183 except excs_to_catch:
184 # e.g.
/opt/conda/envs/env/lib/python3.8/site-packages/geopandas/array.py in __setstate__(self, state)
422 def __setstate__(self, state):
423 if compat.USE_PYGEOS:
--> 424 geoms = pygeos.from_wkb(state[0])
425 self._crs = state[1]
426 self._sindex = None # pygeos.STRtree could not be pickled yet
KeyError: 0
You can download the pickle file produced by the code above from https://drive.google.com/file/d/1VTDaapxsy6DosMmrSv9mUtVzNBn_ISeN/view?usp=sharing
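If you control both environments, one way to sidestep this kind of cross-version pickle incompatibility is to exchange the data in a stable on-disk format such as GeoPackage instead of pickle. This is a workaround sketch, not a fix for an already-written .pkl file:
import geopandas as gpd

# with the old environment (geopandas 0.7.0): write a GeoPackage instead of a pickle
# gdf is the GeoDataFrame from the question
gdf.to_file("geo_data_frame.gpkg", driver="GPKG")

# with the new environment (geopandas 0.10.2): read it back
gdf = gpd.read_file("geo_data_frame.gpkg")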

Python lzma unable to load joblib

I have a scikit-learn pipeline that I serialize using:
with lzma.open('outputs/baseModel_LR.joblib', "wb") as f:
    dill.dump(pipeline, f)
When I try to open the file and load the pipeline using:
with lzma.open('outputs/baseModel_LR.joblib', "rb") as f:
    model = dill.load(f)
it gives error:
---------------------------------------------------------------------------
EOFError Traceback (most recent call last)
somePath/notebooks/test.ipynb Cell 5 in <cell line: 1>()
1 with lzma.open('outputs/baseModel_LR.joblib',"rb") as f:
----> 2 model = dill.load(f)
3 model
File /anaconda/envs/azureml_py38/lib/python3.8/site-packages/dill/_dill.py:373, in load(file, ignore, **kwds)
367 def load(file, ignore=None, **kwds):
368 """
369 Unpickle an object from a file.
370
371 See :func:`loads` for keyword arguments.
372 """
--> 373 return Unpickler(file, ignore=ignore, **kwds).load()
File /anaconda/envs/azureml_py38/lib/python3.8/site-packages/dill/_dill.py:646, in Unpickler.load(self)
645 def load(self): #NOTE: if settings change, need to update attributes
--> 646 obj = StockUnpickler.load(self)
647 if type(obj).__module__ == getattr(_main_module, '__name__', '__main__'):
648 if not self._ignore:
649 # point obj class to main
File /anaconda/envs/azureml_py38/lib/python3.8/lzma.py:200, in LZMAFile.read(self, size)
194 """Read up to size uncompressed bytes from the file.
...
100 "end-of-stream marker was reached")
101 else:
102 rawblock = b""
EOFError: Compressed file ended before the end-of-stream marker was reached
Has anyone faced this problem and solved it? I use lzma because otherwise the joblib file is 27 GB, and with lzma it's just 20 MB.
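For what it's worth, that EOFError from LZMAFile means the compressed stream is truncated: the file was never written to completion (for example the writing process was killed, the disk filled up, or the file was cut short during a transfer). A quick way to check whether an archive is intact before unpickling, using the same path as in the question:
import lzma

with lzma.open('outputs/baseModel_LR.joblib', "rb") as f:
    # stream through the whole file; a truncated archive raises EOFError here too
    while f.read(1024 * 1024):
        pass
print("archive is complete")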

How to fix unpickling key error when loading word2vec (gensim)?

I am trying to load a pre-trained word2vec model in pkl format taken from here
The line of code I use to load it:
model = gensim.models.KeyedVectors.load('enwiki_20180420_500d.pkl')
However, I keep getting the following error (full traceback):
UnpicklingError Traceback (most recent call last)
<ipython-input-15-ebd5780b6636> in <module>
55
56 #Load pretrained word2vec
---> 57 model = gensim.models.KeyedVectors.load('enwiki_20180420_500d.pkl',mmap='r')
58
~/anaconda3/lib/python3.7/site-packages/gensim/models/keyedvectors.py in load(cls, fname_or_handle, **kwargs)
1551 @classmethod
1552 def load(cls, fname_or_handle, **kwargs):
-> 1553 model = super(WordEmbeddingsKeyedVectors, cls).load(fname_or_handle, **kwargs)
1554 if isinstance(model, FastTextKeyedVectors):
1555 if not hasattr(model, 'compatible_hash'):
~/anaconda3/lib/python3.7/site-packages/gensim/models/keyedvectors.py in load(cls, fname_or_handle, **kwargs)
226 @classmethod
227 def load(cls, fname_or_handle, **kwargs):
--> 228 return super(BaseKeyedVectors, cls).load(fname_or_handle, **kwargs)
229
230 def similarity(self, entity1, entity2):
~/anaconda3/lib/python3.7/site-packages/gensim/utils.py in load(cls, fname, mmap)
433 compress, subname = SaveLoad._adapt_by_suffix(fname)
434
--> 435 obj = unpickle(fname)
436 obj._load_specials(fname, mmap, compress, subname)
437 logger.info("loaded %s", fname)
~/anaconda3/lib/python3.7/site-packages/gensim/utils.py in unpickle(fname)
1396 # Because of loading from S3 load can't be used (missing readline in smart_open)
1397 if sys.version_info > (3, 0):
-> 1398 return _pickle.load(f, encoding='latin1')
1399 else:
1400 return _pickle.loads(f.read())
UnpicklingError: invalid load key, ':'.
I tried loading it with load_word2vec_format, but no luck. Any ideas what might be wrong with it?
Per your link https://wikipedia2vec.github.io/wikipedia2vec/pretrained/ these are to be loaded using that library's Wikipedia2Vec.load() method.
Gensim's .load() methods should only be used with files saved directly from Gensim model objects.
The Wikipedia2Vec project does say that their .txt file formats would load with .load_word2vec_format(), so you could also try that - but with one of their .txt format files.
Their full model .pkl files are only going to work with their class's own loading function.
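Put together, the two options look roughly like this (a sketch: the .txt file name is assumed to match the .pkl one on the download page, and the wikipedia2vec package must be installed separately):
from wikipedia2vec import Wikipedia2Vec
from gensim.models import KeyedVectors

# option 1: load the .pkl model with the library that produced it
model = Wikipedia2Vec.load('enwiki_20180420_500d.pkl')

# option 2: load the .txt variant with gensim's word2vec text-format reader
kv = KeyedVectors.load_word2vec_format('enwiki_20180420_500d.txt')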

Wahoo TICKR X .fit file reading/parsing and analysis in Python

Not sure if I can post a question like this here, so please redirect me if I'm in the wrong place.
I've bought a Wahoo TICKR X to monitor my heart rate during exercise. I would also like to get more familiar with Python, so I decided to do the analysis of my heart rate myself in Python instead of in the Wahoo app. I thought this would also give me more freedom in the choice of visualization, testing, etc.
I've recorded my heart rate for 5 minutes or so and exported the .fit file. However, I can't even find a suitable library to read the .fit file. Can anyone recommend a library that works with .fit files from Wahoo?
I'm using Ubuntu, Anaconda, and Python 3.7.
import pyfits
# Load the FITS file into the program
hdulist = pyfits.open('/home/bradmin/Downloads/2020-03-26.fit')
# Load table data as tbdata
tbdata = hdulist[1].data
OSError Traceback (most recent call last)
<ipython-input-3-a970e2cd9dee> in <module>
2
3 # Load the FITS file into the program
----> 4 hdulist = pyfits.open('/home/bradmin/Downloads/2020-03-26.fit')
5
6 # Load table data as tbdata
~/anaconda3/lib/python3.7/site-packages/pyfits/hdu/hdulist.py in fitsopen(name, mode, memmap, save_backup, **kwargs)
122 raise ValueError('Empty filename: %s' % repr(name))
123
--> 124 return HDUList.fromfile(name, mode, memmap, save_backup, **kwargs)
125
126
~/anaconda3/lib/python3.7/site-packages/pyfits/hdu/hdulist.py in fromfile(cls, fileobj, mode, memmap, save_backup, **kwargs)
264
265 return cls._readfrom(fileobj=fileobj, mode=mode, memmap=memmap,
--> 266 save_backup=save_backup, **kwargs)
267
268 @classmethod
~/anaconda3/lib/python3.7/site-packages/pyfits/hdu/hdulist.py in _readfrom(cls, fileobj, data, mode, memmap, save_backup, **kwargs)
853 # raise and exception
854 if mode in ('readonly', 'denywrite') and len(hdulist) == 0:
--> 855 raise IOError('Empty or corrupt FITS file')
856
857 # initialize/reset attributes to be used in "update/append" mode
OSError: Empty or corrupt FITS file
link to the file: https://wetransfer.com/downloads/6d054a5d52899aefcb1bcd22bda92ba120200326161849/b9831a
EDIT
I've tried this now but I get an error:
import fitdecode
src_file = "/home/bradmin/Downloads/2020-03-26.fit"
with fitdecode.FitReader(src_file) as fit:
    for frame in fit:
        # The yielded frame object is of one of the following types:
        # * fitdecode.FitHeader
        # * fitdecode.FitDefinitionMessage
        # * fitdecode.FitDataMessage
        # * fitdecode.FitCRC
        if isinstance(frame, fitdecode.FitDataMessage):
            # Here, frame is a FitDataMessage object.
            # A FitDataMessage object contains decoded values that
            # are directly usable in your script logic.
            print(frame.name)
file_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
developer_data_id
field_description
field_description
field_description
field_description
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-7-e8d95d3087dc> in <module>
2
3 with fitdecode.FitReader(src_file) as fit:
----> 4 for frame in fit:
5 # The yielded frame object is of one of the following types:
6 # * fitdecode.FitHeader
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in __iter__(self)
191
192 def __iter__(self):
--> 193 yield from self._read_next()
194
195 @property
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in _read_next(self)
298 assert self._header
299
--> 300 record = self._read_record()
301 if not record:
302 break
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in _read_record(self)
443 self._add_dev_data_id(message)
444 elif message.mesg_type.mesg_num == profile.MESG_NUM_FIELD_DESCRIPTION:
--> 445 self._add_dev_field_description(message)
446
447 return message
~/anaconda3/lib/python3.7/site-packages/fitdecode/reader.py in _add_dev_field_description(self, message)
780 base_type_id = message.get_field('fit_base_type_id').raw_value
781 field_name = message.get_field('field_name').raw_value
--> 782 units = message.get_field('units').raw_value
783
784 try:
~/anaconda3/lib/python3.7/site-packages/fitdecode/records.py in get_field(self, field_name_or_num, idx)
188 raise KeyError(
189 f'field "{field_name_or_num}" (idx #{idx}) not found in ' +
--> 190 f'message "{self.name}"')
191
192 def get_fields(self, field_name_or_num):
KeyError: 'field "units" (idx #0) not found in message "field_description"'
The format seems to be this FIT format. pyfits is for an entirely different format (FITS, an astronomy image format), it seems.
The article above refers to the gpsbabel tool, which you could use to convert the FIT file to something more interoperable and usable, e.g. GPX (an XML-based format that's easy to parse).
Or, of course, if you want a pure-Python solution, you can port the FIT format reading bits from gpsbabel to Python, or use the fitdecode library.
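If fitdecode keeps tripping over those developer field_description messages, the fitparse library is another pure-Python option that reads FIT activity files. A minimal sketch with the same file path (fitparse is a suggestion of mine, not something the question used):
from fitparse import FitFile

fitfile = FitFile('/home/bradmin/Downloads/2020-03-26.fit')

# 'record' messages carry the per-sample sensor data
for record in fitfile.get_messages('record'):
    values = record.get_values()  # dict mapping field name to value
    print(values.get('timestamp'), values.get('heart_rate'))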

Error when opening some gdb files with fiona and geopandas

I am trying to open NYC LION Geodatabase files for 2010, 2011, and 2012.
I successfully opened the 2012 and 2011 geodatabases with geopandas, but I was unable to open the 2010 version.
I've tried using fiona directly, but I kept getting a similar error.
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
import requests
from zipfile import ZipFile as zzip
import fiona
sys.path.append(os.path.realpath('..'))
path = r"https://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nyc_lion10aav.zip"
r = requests.get(path)
# open method to open a file on your system and write the contents
with open("../input_data/nyc_lion10aav.zip", "wb") as file:
file.write(r.content)
# opening the zip file in READ mode
with zzip("../input_data/nyc_lion10aav.zip", 'r') as file:
# printing all the contents of the zip file
#file.printdir()
path = "../input_data/nyc_lion10aav"
os.mkdir(path)
# extracting all the files
#rint('Extracting all the files now...')
file.extractall(path)
print('Done!')
fp = r"../input_data/nyc_lion10aav/lion/lion.gdb"
lion_gdf = gpd.read_file(fp, driver='OpenFileGDB', layer='lion')
fp = r"../input_data/nyc_lion10aav/lion/lion.gdb"
file = fiona.open(fp, driver='OpenFileGDB', layer='lion')
Notebook
I expected it to go through like the geodatabases from 2011 and 2012 when I ran it in the notebook. I've been searching here and in fiona's GitHub issues to see if others have had a similar problem and whether there is a solution. But I am fairly new to these libraries, so I don't really understand the traceback well enough to figure out what went wrong.
---------------------------------------------------------------------------
CPLE_OpenFailedError Traceback (most recent call last)
fiona/_shim.pyx in fiona._shim.gdal_open_vector()
fiona/_err.pyx in fiona._err.exc_wrap_pointer()
CPLE_OpenFailedError: ../input_data/nyc_lion10aav/lion/lion.gdb: Permission denied
During handling of the above exception, another exception occurred:
DriverError Traceback (most recent call last)
<ipython-input-14-f49f8c92c671> in <module>
1 fp = r"../input_data/nyc_lion10aav/lion/lion.gdb"
----> 2 lion_gdf = gpd.read_file(fp, driver='OpenFileGDB', layer='lion')
~\AppData\Local\Continuum\anaconda3\envs\geo\lib\site-packages\geopandas\io\file.py in read_file(filename, bbox, **kwargs)
75
76 with fiona_env():
---> 77 with reader(path_or_bytes, **kwargs) as features:
78
79 # In a future Fiona release the crs attribute of features will
~\AppData\Local\Continuum\anaconda3\envs\geo\lib\site-packages\fiona\env.py in wrapper(*args, **kwargs)
394 def wrapper(*args, **kwargs):
395 if local._env:
--> 396 return f(*args, **kwargs)
397 else:
398 if isinstance(args[0], str):
~\AppData\Local\Continuum\anaconda3\envs\geo\lib\site-packages\fiona\__init__.py in open(fp, mode, driver, schema, crs, encoding, layer, vfs, enabled_drivers, crs_wkt, **kwargs)
251 if mode in ('a', 'r'):
252 c = Collection(path, mode, driver=driver, encoding=encoding,
--> 253 layer=layer, enabled_drivers=enabled_drivers, **kwargs)
254 elif mode == 'w':
255 if schema:
~\AppData\Local\Continuum\anaconda3\envs\geo\lib\site-packages\fiona\collection.py in __init__(self, path, mode, driver, schema, crs, encoding, layer, vsi, archive, enabled_drivers, crs_wkt, ignore_fields, ignore_geometry, **kwargs)
157 if self.mode == 'r':
158 self.session = Session()
--> 159 self.session.start(self, **kwargs)
160 elif self.mode in ('a', 'w'):
161 self.session = WritingSession()
fiona/ogrext.pyx in fiona.ogrext.Session.start()
fiona/_shim.pyx in fiona._shim.gdal_open_vector()
DriverError: ../input_data/nyc_lion10aav/lion/lion.gdb: Permission denied
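A first diagnostic worth trying (a sketch, not a confirmed fix): let GDAL choose the driver itself and ask for the layer list. If fiona.listlayers() fails the same way on the 2010 download, the geodatabase is probably in a variant the bundled OpenFileGDB driver cannot read, rather than a genuine permissions problem:
import fiona

fp = r"../input_data/nyc_lion10aav/lion/lion.gdb"
# list layers without forcing a driver; raises if GDAL cannot open the gdb at all
print(fiona.listlayers(fp))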
