Geopandas pickle incompatibility between 2 versions - python

Package versions when I dump the GeoDataFrame object with pickle:
scikit-image==0.17.2
scikit-learn==0.22.1
geopandas==0.7.0
pandas==1.0.3
Dump the GeoDataFrame with pickle:
import pandas as pd
import pickle

pickle_file_path = "geo_data_frame.pkl"  # file is attached
gdf.to_pickle(pickle_file_path)  # gdf is a GeoDataFrame, e.g. gdf = gpd.GeoDataFrame(df, crs="epsg:4326")
Package versions when I read the pickled file:
scikit-image==0.19.2
scikit-learn==1.0.2
geopandas==0.10.2
pandas==1.0.3
Read the pickle:
import pandas as pd
import pickle
pickle_file_path = "geo_data_frame.pkl" # file is attached
gdf = pd.read_pickle(pickle_file_path)
Traceback:
---> 58 gdf = pd.read_pickle(pickle_file_path)
59 shutil.rmtree(tempdir)
60 return pickle.dumps(gdf)
/opt/conda/envs/env/lib/python3.8/site-packages/pandas/io/pickle.py in read_pickle(filepath_or_buffer, compression)
180 # We want to silence any warnings about, e.g. moved modules.
181 warnings.simplefilter("ignore", Warning)
--> 182 return pickle.load(f)
183 except excs_to_catch:
184 # e.g.
/opt/conda/envs/env/lib/python3.8/site-packages/geopandas/array.py in __setstate__(self, state)
422 def __setstate__(self, state):
423 if compat.USE_PYGEOS:
--> 424 geoms = pygeos.from_wkb(state[0])
425 self._crs = state[1]
426 self._sindex = None # pygeos.STRtree could not be pickled yet
KeyError: 0
You can directly download the pickle file used in the above code from https://drive.google.com/file/d/1VTDaapxsy6DosMmrSv9mUtVzNBn_ISeN/view?usp=sharing
[Screenshot of the error in Sentry]
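This is not part of the original question, but one way around this kind of version mismatch, assuming the pickle can still be loaded in the old environment (geopandas 0.7.0), is to re-export the data to a format that is stable across geopandas versions (for example a GeoPackage) and read that in the new environment instead of the pickle. A minimal sketch:

# In the old environment (geopandas 0.7.0): load the pickle and re-export it.
import pandas as pd

gdf = pd.read_pickle("geo_data_frame.pkl")
gdf.to_file("geo_data_frame.gpkg", driver="GPKG")  # version-stable on-disk format

# In the new environment (geopandas 0.10.2): read the GeoPackage instead of the pickle.
import geopandas as gpd

gdf = gpd.read_file("geo_data_frame.gpkg")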

How to Output Downloadable file after processing?

Specification
gr.__version__ --> '3.16.2'
I want to create a gradio tab in my gradio app (disregard Tab 1, I am only working on Tab 2) where I:
upload an Excel file
save the name of the Excel file to a variable
process that Excel file and take two numbers out of it (columns '1' and '2')
load the data from the Excel file into a pandas dataframe and add 1 to both of the numbers
turn the dataframe back into an Excel file and output it so the user can download the output Excel file
name the output file the same as the original uploaded file
MY CURRENT Code
import gradio as gr
import pandas as pd

# def func1():
#     # ....
#     pass

def func2(name, file):
    file_name = name
    file_x = file
    # use this function to retrieve the file_x without modification for the gradio.io output
    # excel to dataframe
    df = pd.read_excel(file_x)
    # add 1 to both numbers
    df['1'] = df['1'] + 1
    df['2'] = df['2'] + 1
    # dataframe to excel
    # return the exported excel file with the same name as the original file
    return df.to_excel(file_x, index=False)
# GRADIO APP
with gr.Blocks() as demo:
    gr.Markdown("BI App")

    ''' #1.TAB '''
    # with gr.Tab("Tab1"):
    #     # .... unimportant code
    #     with gr.Column():
    #         file_obj = gr.File(label="Input File",
    #                            file_count="single",
    #                            file_types=["", ".", ".csv",".xls",".xlsx"]),
    #         # extract the filename from gradio.io file object
    #         # keyfile_name = gr.Interface(file_name_reader, inputs="file", outputs=None)
    #         keyfile_name = 'nothing'
    #         tab1_inputs = [keyfile_name, file_obj]
    #     with gr.Column():
    #         # output excel file with gradio.io
    #         tab1_outputs = [gr.File(label="Output File",
    #                                 file_count="single",
    #                                 file_types=["", ".", ".csv",".xls",".xlsx"])]
    #     tab1_submit_button = gr.Button("Submit")

    ''' #2.TAB - I EDIT THIS TAB'''
    with gr.Tab("Tab2"):
        admitad_invoice_approvals_button = gr.Button("Submit")

        def file_name_reader(file):
            file_name = file.name  # extract the file name from the uploaded file
            return file_name

        # iface = gr.Interface(file_name_reader, inputs="file", outputs=None)
        with gr.Column():
            file_obj = gr.File(label="Input File",
                               file_count="single",
                               file_types=["", ".", ".csv",".xls",".xlsx"]),
            # extract the filename from gradio.io file object
            keyfile_name = gr.Interface(file_name_reader, inputs="file", outputs=None)
            tab2_inputs = [keyfile_name, file_obj]
        with gr.Column():
            # output excel file with gradio.io
            tab2_outputs = [gr.File(label="Output File",
                                    file_count="single",
                                    file_types=["", ".", ".csv",".xls",".xlsx"])]
        tab2_submit_button = gr.Button("Submit")

    '''1 button for each of the tabs to execute the GUI TASK'''
    # tab1_submit_button.click(func1,
    #                          inputs=tab1_inputs,
    #                          outputs=tab1_outputs)
    tab2_submit_button.click(func2,
                             inputs=tab2_inputs,
                             outputs=tab2_outputs)

''' EXECUTING THE APP'''
demo.launch(debug=True, share=True)  ## PRODUCTION TESTING
ERROR:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[7], line 95
90 '''1 button for each of the tabs to execute the GUI TASK'''
91 # tab1_submit_button.click(func1,
92 # inputs=tab1_inputs,
93 # outputs=tab1_outputs)
---> 95 tab2_submit_button.click(func2,
96 inputs=tab2_inputs,
97 outputs=tab2_outputs)
100 ''' EXECUTING THE APP'''
101 demo.launch(debug=True, share=True) ## PRODUCTION TESTING
File ~/.local/lib/python3.8/site-packages/gradio/events.py:145, in Clickable.click(self, fn, inputs, outputs, api_name, status_tracker, scroll_to_output, show_progress, queue, batch, max_batch_size, preprocess, postprocess, cancels, every, _js)
140 if status_tracker:
141 warnings.warn(
142 "The 'status_tracker' parameter has been deprecated and has no effect."
143 )
--> 145 dep = self.set_event_trigger(
146 "click",
147 fn,
148 inputs,
149 outputs,
150 preprocess=preprocess,
151 postprocess=postprocess,
152 scroll_to_output=scroll_to_output,
153 show_progress=show_progress,
154 api_name=api_name,
155 js=_js,
156 queue=queue,
157 batch=batch,
158 max_batch_size=max_batch_size,
159 every=every,
160 )
161 set_cancel_events(self, "click", cancels)
162 return dep
File ~/.local/lib/python3.8/site-packages/gradio/blocks.py:225, in Block.set_event_trigger(self, event_name, fn, inputs, outputs, preprocess, postprocess, scroll_to_output, show_progress, api_name, js, no_target, queue, batch, max_batch_size, cancels, every)
217 warnings.warn(
218 "api_name {} already exists, using {}".format(api_name, api_name_)
219 )
220 api_name = api_name_
222 dependency = {
223 "targets": [self._id] if not no_target else [],
224 "trigger": event_name,
...
237 }
238 Context.root_block.dependencies.append(dependency)
239 return dependency
AttributeError: 'tuple' object has no attribute '_id'
Tried
I have looked into https://gradio.app/docs/#file, but the output file generation is not clearly explained, especially regarding how to apply it to my case.
Instead of
with gr.Column():
    file_obj = gr.File(label="Input File"
                       # no other arguments
                       )
    input = file_obj
Just have
with gr.Column():
    file_obj = gr.File(label="Input File")
    input = file_obj
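For reference, here is a minimal sketch (not the exact code from the question) of how Tab 2 could be wired up in gradio 3.x. It assumes the uploaded file object exposes a .name attribute holding the path of its temporary copy, and that returning a file path from the callback is enough for a gr.File output to offer it for download. Note that file_obj is assigned without a trailing comma; the trailing comma in the original code turns the component into a tuple, which is what the 'tuple' object has no attribute '_id' error points at.

import gradio as gr
import pandas as pd

def func2(file):
    # file is the temporary copy of the upload; .name is its path on disk
    df = pd.read_excel(file.name)
    # add 1 to both numbers
    df['1'] = df['1'] + 1
    df['2'] = df['2'] + 1
    # overwrite the temporary copy so the download keeps its file name
    # (assuming gradio preserves the original name in the temp path)
    df.to_excel(file.name, index=False)
    # return a path, not the return value of to_excel (which is None)
    return file.name

with gr.Blocks() as demo:
    gr.Markdown("BI App")
    with gr.Tab("Tab2"):
        with gr.Column():
            file_obj = gr.File(label="Input File",
                               file_count="single",
                               file_types=[".csv", ".xls", ".xlsx"])  # no trailing comma
        with gr.Column():
            output_file = gr.File(label="Output File")
        tab2_submit_button = gr.Button("Submit")
        tab2_submit_button.click(func2, inputs=[file_obj], outputs=[output_file])

demo.launch()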

Read shapefile from HDFS with geopandas

I have a shapefile on my HDFS and I would like to import it in my Jupyter Notebook with geopandas (version 0.8.1).
I tried the standard read_file() method, but it does not recognize the HDFS directory; I believe it searches my local directory instead, since the same call with a local path reads the shapefile correctly.
This is the code I used:
import geopandas as gpd
shp = gpd.read_file('hdfs://hdfsha/my_hdfs_directory/my_shapefile.shp')
and the error I obtained:
---------------------------------------------------------------------------
CPLE_OpenFailedError Traceback (most recent call last)
fiona/_shim.pyx in fiona._shim.gdal_open_vector()
fiona/_err.pyx in fiona._err.exc_wrap_pointer()
CPLE_OpenFailedError: hdfs://hdfsha/my_hdfs_directory/my_shapefile.shp: No such file or directory
During handling of the above exception, another exception occurred:
DriverError Traceback (most recent call last)
<ipython-input-17-3118e740e4a9> in <module>
----> 2 shp = gpd.read_file('hdfs://hdfsha/my_hdfs_directory/my_shapefile.shp')
3 print(shp.shape)
4 shp.head(3)
/opt/venv/geocoding/lib/python3.6/site-packages/geopandas/io/file.py in _read_file(filename, bbox, mask, rows, **kwargs)
94
95 with fiona_env():
---> 96 with reader(path_or_bytes, **kwargs) as features:
97
98 # In a future Fiona release the crs attribute of features will
/opt/venv/geocoding/lib/python3.6/site-packages/fiona/env.py in wrapper(*args, **kwargs)
398 def wrapper(*args, **kwargs):
399 if local._env:
--> 400 return f(*args, **kwargs)
401 else:
402 if isinstance(args[0], str):
/opt/venv/geocoding/lib/python3.6/site-packages/fiona/__init__.py in open(fp, mode, driver, schema, crs, encoding, layer, vfs, enabled_drivers, crs_wkt, **kwargs)
255 if mode in ('a', 'r'):
256 c = Collection(path, mode, driver=driver, encoding=encoding,
--> 257 layer=layer, enabled_drivers=enabled_drivers, **kwargs)
258 elif mode == 'w':
259 if schema:
/opt/venv/geocoding/lib/python3.6/site-packages/fiona/collection.py in __init__(self, path, mode, driver, schema, crs, encoding, layer, vsi, archive, enabled_drivers, crs_wkt, ignore_fields, ignore_geometry, **kwargs)
160 if self.mode == 'r':
161 self.session = Session()
--> 162 self.session.start(self, **kwargs)
163 elif self.mode in ('a', 'w'):
164 self.session = WritingSession()
fiona/ogrext.pyx in fiona.ogrext.Session.start()
fiona/_shim.pyx in fiona._shim.gdal_open_vector()
DriverError: hdfs://hdfsha/my_hdfs_directory/my_shapefile.shp: No such file or directory
So, I was wondering whether it is actually possible to read a shapefile, stored in HDFS, with geopandas. If yes, how?
If someone is still looking for an answer to this question, I managed to find a workaround.
First of all, you need a .zip file which contains all the data related to your shapefile (.shp, .shx, .dbf, ...). Then, we use pyarrow to establish a connection to HDFS and fiona to read the zipped shapefile.
Package versions I'm using:
pyarrow==2.0.0
fiona==1.8.18
The code:
# import packages
import pandas as pd
import geopandas as gpd
import fiona
import pyarrow
# establish a connection to HDFS
fs = pyarrow.hdfs.connect()
# read zipped shapefile
with fiona.io.ZipMemoryFile(fs.open('hdfs://my_hdfs_directory/my_zipped_shapefile.zip')) as z:
    with z.open('my_shp_file_within_zip.shp') as collection:
        gdf = gpd.GeoDataFrame.from_features(collection)
        print(gdf.shape)
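One caveat worth adding (not in the original answer): GeoDataFrame.from_features does not pick up the layer's CRS by itself, so, assuming the shapefile carries one, it can be passed through from the fiona collection:

with fiona.io.ZipMemoryFile(fs.open('hdfs://my_hdfs_directory/my_zipped_shapefile.zip')) as z:
    with z.open('my_shp_file_within_zip.shp') as collection:
        # collection.crs is read from the .prj sidecar inside the zip
        gdf = gpd.GeoDataFrame.from_features(collection, crs=collection.crs)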

Python, get an error when using np.dstack and can’t solve it

When running the following code
import matplotlib.pyplot as plt
import numpy as np
import skimage, skimage.io
from skimage.color import rgb2hsv
from skimage import io
ic = skimage.io.imread_collection('/Users/ /remoteSensing/Image/Landsat/*.tif')
img = np.dstack((ic[5],ic[4],ic[3]))
I got the following error.
TypeError: unexpected keyword argument: img_num
I read in an older question that there was a bug in older versions of NumPy and scikit-image, but my versions are:
numpy 1.19.1
scikit-image 0.17.2
tifffile 0.15.1
I would appreciate any help to solve this error.
Here is the link to the /Landsat folder:
https://1drv.ms/u/s!AizvpupSaqfjhzcdFz173WMZfo_J?e=N1UaeL
This is the traceback:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-4-ee45cc9a6663> in <module>
----> 1 img = np.dstack((ic[5],ic[4],ic[3]))
2
/Applications/anaconda3/lib/python3.7/site-packages/skimage/io/collection.py in __getitem__(self, n)
274 kwargs['img_num'] = img_num
275 try:
--> 276 self.data[idx] = self.load_func(fname, **kwargs)
277 # Account for functions that do not accept an img_num kwarg
278 except TypeError as e:
/Applications/anaconda3/lib/python3.7/site-packages/skimage/io/_plugins/imageio_plugin.py in imread(*args, **kwargs)
8 @wraps(imageio_imread)
9 def imread(*args, **kwargs):
---> 10 return np.asarray(imageio_imread(*args, **kwargs))
/Applications/anaconda3/lib/python3.7/site-packages/imageio/core/functions.py in imread(uri, format, **kwargs)
263
264 # Get reader and read first
--> 265 reader = read(uri, format, "i", **kwargs)
266 with reader:
267 return reader.get_data(0)
/Applications/anaconda3/lib/python3.7/site-packages/imageio/core/functions.py in get_reader(uri, format, mode, **kwargs)
184
185 # Return its reader object
--> 186 return format.get_reader(request)
187
188
/Applications/anaconda3/lib/python3.7/site-packages/imageio/core/format.py in get_reader(self, request)
168 "Format %s cannot read in %s mode" % (self.name, modename)
169 )
--> 170 return self.Reader(self, request)
171
172 def get_writer(self, request):
/Applications/anaconda3/lib/python3.7/site-packages/imageio/core/format.py in __init__(self, format, request)
219 self._request = request
220 # Open the reader/writer
--> 221 self._open(**self.request.kwargs.copy())
222
223 @property
/Applications/anaconda3/lib/python3.7/site-packages/imageio/plugins/tifffile.py in _open(self, **kwargs)
224 self._f = None
225 f = self.request.get_file()
--> 226 self._tf = _tifffile.TiffFile(f, **kwargs)
227
228 # metadata is the same for all images
/Applications/anaconda3/lib/python3.7/site-packages/tifffile/tifffile.py in __init__(self, arg, name, offset, size, multifile, _useframes, _master, **kwargs)
2461 setattr(self, key, bool(value))
2462 else:
-> 2463 raise TypeError(f'unexpected keyword argument: {key}')
2464
2465 fh = FileHandle(arg, mode='rb', name=name, offset=offset, size=size)
TypeError: unexpected keyword argument: img_num
Can you try updating your scikit-image version and installing tifffile as well? scikit-image 0.17 changed the way tiff files are read and that might fix the problem.
Additionally, as pointed out by hpaulj, you need to pass in a list/tuple to np.dstack:
np.dstack((ic[5], ic[4], ic[3]))
(note the extra set of parentheses.)
Update
We finally figured this out on the skimage issue tracker. The fix is to specify the tifffile plugin when calling imread_collection:
import matplotlib.pyplot as plt
import numpy as np
import skimage, skimage.io
from skimage.color import rgb2hsv
from skimage import io
ic = skimage.io.imread_collection(
    '/Users/ /remoteSensing/Image/Landsat/*.tif', plugin='tifffile'
)
img = np.dstack((ic[5],ic[4],ic[3]))

Error when opening some gdb files with fiona and geopandas

I am trying to open NYC LION Geodatabase files for 2010, 2011, and 2012.
I successfully opened the 2012 and 2011 geodatabases with geopandas, but I was unable to open the 2010 version.
I've tried using fiona directly, but I kept getting a similar error.
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
import requests
from zipfile import ZipFile as zzip
import fiona
sys.path.append(os.path.realpath('..'))
path = r"https://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nyc_lion10aav.zip"
r = requests.get(path)
# open method to open a file on your system and write the contents
with open("../input_data/nyc_lion10aav.zip", "wb") as file:
file.write(r.content)
# opening the zip file in READ mode
with zzip("../input_data/nyc_lion10aav.zip", 'r') as file:
# printing all the contents of the zip file
#file.printdir()
path = "../input_data/nyc_lion10aav"
os.mkdir(path)
# extracting all the files
#rint('Extracting all the files now...')
file.extractall(path)
print('Done!')
fp = r"../input_data/nyc_lion10aav/lion/lion.gdb"
lion_gdf = gpd.read_file(fp, driver='OpenFileGDB', layer='lion')
fp = r"../input_data/nyc_lion10aav/lion/lion.gdb"
file = fiona.open(fp, driver='OpenFileGDB', layer='lion')
Notebook
I expected it to go through like the geodatabases from 2011 and 2012 did when I ran it in the notebook. I've been searching here and on fiona's GitHub issues to see whether others have had a similar problem and whether there was a solution. But I am fairly new to these libraries, so I don't understand the traceback well enough to figure out what went wrong.
---------------------------------------------------------------------------
CPLE_OpenFailedError Traceback (most recent call last)
fiona/_shim.pyx in fiona._shim.gdal_open_vector()
fiona/_err.pyx in fiona._err.exc_wrap_pointer()
CPLE_OpenFailedError: ../input_data/nyc_lion10aav/lion/lion.gdb: Permission denied
During handling of the above exception, another exception occurred:
DriverError Traceback (most recent call last)
<ipython-input-14-f49f8c92c671> in <module>
1 fp = r"../input_data/nyc_lion10aav/lion/lion.gdb"
----> 2 lion_gdf = gpd.read_file(fp, driver='OpenFileGDB', layer='lion')
~\AppData\Local\Continuum\anaconda3\envs\geo\lib\site-packages\geopandas\io\file.py in read_file(filename, bbox, **kwargs)
75
76 with fiona_env():
---> 77 with reader(path_or_bytes, **kwargs) as features:
78
79 # In a future Fiona release the crs attribute of features will
~\AppData\Local\Continuum\anaconda3\envs\geo\lib\site-packages\fiona\env.py in wrapper(*args, **kwargs)
394 def wrapper(*args, **kwargs):
395 if local._env:
--> 396 return f(*args, **kwargs)
397 else:
398 if isinstance(args[0], str):
~\AppData\Local\Continuum\anaconda3\envs\geo\lib\site-packages\fiona\__init__.py in open(fp, mode, driver, schema, crs, encoding, layer, vfs, enabled_drivers, crs_wkt, **kwargs)
251 if mode in ('a', 'r'):
252 c = Collection(path, mode, driver=driver, encoding=encoding,
--> 253 layer=layer, enabled_drivers=enabled_drivers, **kwargs)
254 elif mode == 'w':
255 if schema:
~\AppData\Local\Continuum\anaconda3\envs\geo\lib\site-packages\fiona\collection.py in __init__(self, path, mode, driver, schema, crs, encoding, layer, vsi, archive, enabled_drivers, crs_wkt, ignore_fields, ignore_geometry, **kwargs)
157 if self.mode == 'r':
158 self.session = Session()
--> 159 self.session.start(self, **kwargs)
160 elif self.mode in ('a', 'w'):
161 self.session = WritingSession()
fiona/ogrext.pyx in fiona.ogrext.Session.start()
fiona/_shim.pyx in fiona._shim.gdal_open_vector()
DriverError: ../input_data/nyc_lion10aav/lion/lion.gdb: Permission denied

Can't import json data from Google Cloud Storage into Pandas

I have an issue reading data from a JSON file residing in a Google Storage bucket into a dataframe:
%env GOOGLE_APPLICATION_CREDENTIALS = /my_path/artcollect/google-api-keys.json
import pandas as pd
from google.cloud import storage
client = storage.Client()
bucket = client.get_bucket('my_bucket')
blob = bucket.get_blob('my_data.json')
content = blob.download_as_string()
When running
lots = pd.read_json(content)
I'm getting a ValueError: Trailing data. content returns
b'[\n{\n "auction_house_name": "Phillips",\n "url": "https://www.phillips.com/detail/ARTURO-HERRERA/NY000312/1",\n "sale_id": "NY000312", .....
I tried the following to remove the newlines and multiple spaces:
import re
c = re.sub(' +', ' ', content.decode('utf-8').replace('\n', ""))
c returns
'[{ "auction_house_name": "Phillips", "url": "https://www.phillips.com/detail/ARTURO-HERRERA/NY000312/1", "sale_id": "NY000312", .....
pd.read_json(c) still returns the ValueError: Trailing data. The full error log:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-154-e9efacd0aab2> in <module>()
----> 1 lots = pd.read_json(c)
~/anaconda/lib/python3.6/site-packages/pandas/io/json/json.py in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, lines)
352 obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
353 keep_default_dates, numpy, precise_float,
--> 354 date_unit).parse()
355
356 if typ == 'series' or obj is None:
~/anaconda/lib/python3.6/site-packages/pandas/io/json/json.py in parse(self)
420
421 else:
--> 422 self._parse_no_numpy()
423
424 if self.obj is None:
~/anaconda/lib/python3.6/site-packages/pandas/io/json/json.py in _parse_no_numpy(self)
637 if orient == "columns":
638 self.obj = DataFrame(
--> 639 loads(json, precise_float=self.precise_float), dtype=None)
640 elif orient == "split":
641 decoded = dict((str(k), v)
ValueError: Trailing data
What am I doing wrong?
The issue was not really related to Google Storage or to misused Pandas functionality. The problem was that my JSON file was not correctly formatted. The process which created the JSON data expected to start from an empty or missing file; in my test scenario I ran the process a couple of times, so at the end my data file had several lists appended to each other, such as
[....... json content 1
]
[....... json content 2
]
I found this out by searching for ][.
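Regenerating the file was the real fix, but if one had to load such a file with several top-level arrays appended to each other anyway, a json.JSONDecoder.raw_decode loop is one way to do it. This is only a sketch; read_concatenated_json is a hypothetical helper, not part of the original code:

import json
import pandas as pd

def read_concatenated_json(text):
    # parse a string containing several JSON arrays appended back to back
    decoder = json.JSONDecoder()
    records, pos = [], 0
    while pos < len(text):
        # skip whitespace between the appended arrays
        while pos < len(text) and text[pos].isspace():
            pos += 1
        if pos >= len(text):
            break
        chunk, pos = decoder.raw_decode(text, pos)
        records.extend(chunk)  # each chunk is a list of lot dicts
    return pd.DataFrame(records)

lots = read_concatenated_json(content.decode('utf-8'))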
The way I access a JSON file residing in a Google bucket from Jupyter was correct:
%env GOOGLE_APPLICATION_CREDENTIALS = /my_path/google-api-keys.json
import pandas as pd
from google.cloud import storage
client = storage.Client()
bucket = client.get_bucket('my_bucket')
blob = bucket.get_blob('my_data.json')
content = blob.download_as_string()
my_df = pd.read_json(content)
The Google API keys file can be created from the console via APIs & Services > Credentials > Create credentials > Service Account key > select JSON
