I am trying to write a Dask dataframe to HDFS in Parquet format using the pyarrow engine via the to_parquet API.
However, the write fails with the exception below:
dask_df.to_parquet(parquet_path,engine=engine)
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/dask/dataframe/core.py", line 985, in to_parquet
return to_parquet(self, path, *args, **kwargs)
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/dask/dataframe/io/parquet.py", line 618, in to_parquet
out.compute()
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/dask/base.py", line 135, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/dask/base.py", line 333, in compute
results = get(dsk, keys, **kwargs)
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/distributed/client.py", line 1999, in get
results = self.gather(packed, asynchronous=asynchronous)
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/distributed/client.py", line 1437, in gather
asynchronous=asynchronous)
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/distributed/client.py", line 592, in sync
return sync(self.loop, func, *args, **kwargs)
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/distributed/utils.py", line 254, in sync
six.reraise(*error[0])
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/six.py", line 693, in reraise
raise value
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/distributed/utils.py", line 238, in f
result[0] = yield make_coro()
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/tornado/gen.py", line 1055, in run
value = future.result()
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/tornado/concurrent.py", line 238, in result
raise_exc_info(self._exc_info)
File "", line 4, in raise_exc_info
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/tornado/gen.py", line 1063, in run
yielded = self.gen.throw(*exc_info)
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/distributed/client.py", line 1315, in _gather
traceback)
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/six.py", line 692, in reraise
raise value.with_traceback(tb)
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/dask/dataframe/io/parquet.py", line 410, in _write_partition_pyarrow
import pyarrow as pa
File "/ebs/d1/agent/miniconda3/envs/dask-distributed/lib/python3.6/site-packages/pyarrow/__init__.py", line 113, in
import pyarrow.hdfs as hdfs
AttributeError: module 'pyarrow' has no attribute 'hdfs'
pyarrow version: 0.8.0; distributed version: 1.20.2.
However, when I import the same module in a Python console on the worker environment, it succeeds without any error:
import pyarrow.hdfs as hdfs