Commit
fix(iorw.py): update HDFSHandler to adapt PyArrow.fs.HadoopFileSystem API (#658)

* Add conventional default arguments

* Fix HDFSHandler to adapt to the PyArrow FileSystem API

* Fix the HDFS tests

* Fix the formatting of test_hdfs.py
reoono committed May 1, 2022
1 parent f3573cc commit 76906a8
Showing 3 changed files with 16 additions and 13 deletions.
15 changes: 6 additions & 9 deletions papermill/iorw.py
@@ -51,13 +51,10 @@
 GCSFileSystem = missing_dependency_generator("gcsfs", "gcs")
 
 try:
-    try:
-        from pyarrow.fs import HadoopFileSystem
-    except ImportError:
-        # Attempt the older package import pattern in case we're using an old dep version.
-        from pyarrow import HadoopFileSystem
+    from pyarrow.fs import HadoopFileSystem, FileSelector
 except ImportError:
     HadoopFileSystem = missing_dependency_generator("pyarrow", "hdfs")
 
 try:
     from github import Github
 except ImportError:
@@ -358,18 +355,18 @@ def __init__(self):
 
     def _get_client(self):
         if self._client is None:
-            self._client = HadoopFileSystem()
+            self._client = HadoopFileSystem(host="default")
         return self._client
 
     def read(self, path):
-        with self._get_client().open(path, 'rb') as f:
+        with self._get_client().open_input_stream(path) as f:
             return f.read()
 
     def listdir(self, path):
-        return self._get_client().ls(path)
+        return [f.path for f in self._get_client().get_file_info(FileSelector(path))]
 
     def write(self, buf, path):
-        with self._get_client().open(path, 'wb') as f:
+        with self._get_client().open_output_stream(path) as f:
             return f.write(str.encode(buf))
 
     def pretty_path(self, path):
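For reviewers less familiar with the new-style filesystem interface, here is a minimal sketch (not part of this commit) of how the pyarrow.fs calls used above fit together. It assumes pyarrow >= 2.0 and a Hadoop client environment that libhdfs can locate (JAVA_HOME / HADOOP_HOME / CLASSPATH); the paths are illustrative, and host="default" simply mirrors the handler's new default.

# Illustrative sketch of the pyarrow.fs API now used by HDFSHandler.
# Assumes pyarrow >= 2.0 and a working Hadoop client environment; the
# paths below are made up for this example.
from pyarrow.fs import FileSelector, HadoopFileSystem

fs = HadoopFileSystem(host="default")  # same default the handler now passes

path = "/tmp/papermill-demo/example.ipynb"  # hypothetical notebook path

# write(): open_output_stream() replaces the old open(path, 'wb')
with fs.open_output_stream(path) as f:
    f.write(b'{"cells": []}')

# read(): open_input_stream() replaces the old open(path, 'rb')
with fs.open_input_stream(path) as f:
    content = f.read()

# listdir(): get_file_info(FileSelector(...)) replaces the old ls();
# it returns FileInfo objects, so only the .path strings are kept.
infos = fs.get_file_info(FileSelector("/tmp/papermill-demo"))
print(content, [info.path for info in infos])

The .path extraction in the last two lines is exactly what the new listdir() does, which is why the tests below gain a MockFileInfo with a path attribute.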
12 changes: 9 additions & 3 deletions papermill/tests/test_hdfs.py
@@ -5,12 +5,14 @@
 
 
 class MockHadoopFileSystem(MagicMock):
-    def ls(self, path):
-        return ['test1.ipynb', 'test2.ipynb']
+    def get_file_info(self, path):
+        return [MockFileInfo('test1.ipynb'), MockFileInfo('test2.ipynb')]
 
-    def open(self, path, *args):
+    def open_input_stream(self, path):
         return MockHadoopFile()
 
+    def open_output_stream(self, path):
+        return MockHadoopFile()
 
 class MockHadoopFile(object):
     def __init__(self):
@@ -29,6 +31,10 @@ def write(self, new_content):
         self._content = new_content
         return 1
 
+class MockFileInfo(object):
+    def __init__(self, path):
+        self.path = path
+
 
 @patch('papermill.iorw.HadoopFileSystem', side_effect=MockHadoopFileSystem())
 class HDFSTest(unittest.TestCase):
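As an illustration of how these mocks are meant to be driven, here is a hypothetical test sketch (not part of this commit) that reuses the MockHadoopFileSystem and MockFileInfo defined above; the test name and hdfs:// path are made up, and pyarrow must be importable so that papermill.iorw can construct a real FileSelector.

# Hypothetical sketch reusing the mocks defined above in this module.
import unittest
from unittest.mock import patch

from papermill.iorw import HDFSHandler


@patch('papermill.iorw.HadoopFileSystem', side_effect=MockHadoopFileSystem())
class HDFSListdirSketch(unittest.TestCase):
    def test_listdir_returns_paths(self, mock_hdfs):
        # The mocked client's get_file_info() yields MockFileInfo objects,
        # and HDFSHandler.listdir() unwraps them into plain path strings.
        self.assertEqual(
            HDFSHandler().listdir('hdfs:///projects/'),
            ['test1.ipynb', 'test2.ipynb'],
        )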
2 changes: 1 addition & 1 deletion requirements/hdfs.txt
@@ -1 +1 @@
-pyarrow
+pyarrow >= 2.0
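The new pin reflects that the pyarrow.fs classes imported in iorw.py are only available in recent pyarrow releases. A quick, purely illustrative sanity check for a local environment:

# Illustrative check (not part of this commit) that the installed pyarrow
# exposes the filesystem interface HDFSHandler now relies on.
import pyarrow
from pyarrow.fs import FileSelector, HadoopFileSystem  # ImportError on old pyarrow

print("pyarrow", pyarrow.__version__, HadoopFileSystem, FileSelector)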
