Skip to content

Commit

Permalink
Read notebook from github (#622)
Browse files Browse the repository at this point in the history
* Add handler for github input

* Fix indent

* Fix github argument
  • Loading branch information
onevirus committed Jul 27, 2021
1 parent e2f23ca commit b5b787e
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 3 deletions.
45 changes: 42 additions & 3 deletions papermill/iorw.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,16 @@
GCSFileSystem = missing_dependency_generator("gcsfs", "gcs")
try:
try:
from pyarrow.fs import HadoopFileSystem
from pyarrow.fs import HadoopFileSystem
except ImportError:
# Attempt the older package import pattern in case we're using an old dep version.
from pyarrow import HadoopFileSystem
# Attempt the older package import pattern in case we're using an old dep version.
from pyarrow import HadoopFileSystem
except ImportError:
HadoopFileSystem = missing_dependency_generator("pyarrow", "hdfs")
try:
from github import Github
except ImportError:
Github = missing_dependency_generator("pygithub", "github")


def fallback_gs_is_retriable(e):
Expand Down Expand Up @@ -365,6 +369,39 @@ def pretty_path(self, path):
return path


class GithubHandler(object):
    """Read-only handler that fetches file contents from github.com URLs.

    Paths are expected to have the shape
    ``https://github.com/<org>/<repo>/blob/<ref>/<path/to/file>``.
    The ``<ref>`` is taken from a single path segment, so refs that contain
    a ``/`` cannot be addressed through this handler.
    """

    def __init__(self):
        # The API client is created lazily by _get_client().
        self._client = None

    def _get_client(self):
        """Return the cached ``Github`` client, creating it on first use.

        If the ``GITHUB_ACCESS_TOKEN`` environment variable holds a non-empty
        value it is passed to the client; otherwise the client is created
        without arguments.
        """
        if self._client is None:
            token = os.environ.get('GITHUB_ACCESS_TOKEN')
            self._client = Github(token) if token else Github()
        return self._client

    @staticmethod
    def _parse_path(path):
        """Split a GitHub URL into ``(org_id, repo_id, ref_id, sub_path)``.

        Raises:
            PapermillException: if the URL is too short or has an empty
                organisation, repository, ref or file path component.
        """
        # Expected layout after splitting on '/':
        # ['https:', '', 'github.com', org, repo, 'blob', ref, *file_path]
        parts = path.split('/')
        sub_path = '/'.join(parts[7:])
        if len(parts) < 8 or not (parts[3] and parts[4] and parts[6] and sub_path):
            raise PapermillException(
                "Unsupported GitHub path '{}': expected "
                "https://github.com/<org>/<repo>/blob/<ref>/<path>".format(path)
            )
        return parts[3], parts[4], parts[6], sub_path

    def read(self, path):
        """Return the decoded content of the file that ``path`` points to.

        Raises:
            PapermillException: if ``path`` does not have the expected shape
                or resolves to a directory instead of a file.
        """
        org_id, repo_id, ref_id, sub_path = self._parse_path(path)
        repo = self._get_client().get_repo(org_id + '/' + repo_id)
        content = repo.get_contents(sub_path, ref=ref_id)
        # get_contents returns a list of entries when sub_path is a directory.
        if isinstance(content, list):
            raise PapermillException(
                "GitHub path '{}' is a directory, not a file".format(path)
            )
        return content.decoded_content

    def listdir(self, path):
        """Not supported; always raises ``PapermillException``."""
        raise PapermillException('listdir is not supported by GithubHandler')

    def write(self, buf, path):
        """Not supported; always raises ``PapermillException``."""
        raise PapermillException('write is not supported by GithubHandler')

    def pretty_path(self, path):
        """Return ``path`` unchanged."""
        return path


# Hack to make YAML loader not auto-convert datetimes
# https://stackoverflow.com/a/52312810
class NoDatesSafeLoader(yaml.SafeLoader):
Expand All @@ -384,6 +421,8 @@ class NoDatesSafeLoader(yaml.SafeLoader):
papermill_io.register("https://", HttpHandler)
papermill_io.register("gs://", GCSHandler())
papermill_io.register("hdfs://", HDFSHandler())
# Route github.com URLs (http and https) to GithubHandler.
# NOTE(review): these prefixes overlap with the generic "https://" prefix
# registered above; presumably the registry prefers these later, more
# specific registrations -- confirm against papermill_io's lookup order.
papermill_io.register("http://github.com/", GithubHandler())
papermill_io.register("https://github.com/", GithubHandler())
papermill_io.register_entry_points()


Expand Down
1 change: 1 addition & 0 deletions requirements/github.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
PyGithub >= 1.55
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def read_reqs(fname, folder=None):
azure_reqs = read_reqs('azure.txt', folder='requirements')
gcs_reqs = read_reqs('gcs.txt', folder='requirements')
hdfs_reqs = read_reqs('hdfs.txt', folder='requirements')
github_reqs = read_reqs('github.txt', folder='requirements')
# Aggregate of every optional backend's requirements; github_reqs is included
# so the GitHub backend is not silently left out of the combined list.
all_reqs = s3_reqs + azure_reqs + gcs_reqs + hdfs_reqs + github_reqs
# Development requirements build on top of the combined backend list.
dev_reqs = read_reqs('dev.txt', folder='requirements') + all_reqs
extras_require = {
Expand All @@ -53,6 +54,7 @@ def read_reqs(fname, folder=None):
"azure": azure_reqs,
"gcs": gcs_reqs,
"hdfs": hdfs_reqs,
"github": github_reqs,
}

# Get the long description from the README file
Expand Down

0 comments on commit b5b787e

Please sign in to comment.