Skip to content

Commit

Permalink
Support get file(notebook) md5 (#1363)
Browse files Browse the repository at this point in the history
  • Loading branch information
Wh1isper committed Nov 19, 2023
1 parent 8ed8b33 commit ea6ceee
Show file tree
Hide file tree
Showing 8 changed files with 150 additions and 38 deletions.
9 changes: 9 additions & 0 deletions docs/source/developers/contents.rst
Expand Up @@ -63,6 +63,10 @@ Models may contain the following entries:
| |``None`` |if any. (:ref:`See |
| | |Below<modelcontent>`) |
+--------------------+-----------+------------------------------+
|**md5** |unicode or |The md5 of the contents. |
| |``None`` | |
| | | |
+--------------------+-----------+------------------------------+

.. _modelcontent:

Expand All @@ -76,6 +80,8 @@ model. There are three model types: **notebook**, **file**, and **directory**.
:class:`nbformat.notebooknode.NotebookNode` representing the .ipynb file
represented by the model. See the `NBFormat`_ documentation for a full
description.
- The ``md5`` field is a hexdigest string of the md5 value of the notebook
file.

- ``file`` models
- The ``format`` field is either ``"text"`` or ``"base64"``.
Expand All @@ -85,12 +91,14 @@ model. There are three model types: **notebook**, **file**, and **directory**.
file models, ``content`` simply contains the file's bytes after decoding
as UTF-8. Non-text (``base64``) files are read as bytes, base64 encoded,
and then decoded as UTF-8.
- The ``md5`` field is a hexdigest string of the md5 value of the file.

- ``directory`` models
- The ``format`` field is always ``"json"``.
- The ``mimetype`` field is always ``None``.
- The ``content`` field contains a list of :ref:`content-free<contentfree>`
models representing the entities in the directory.
- The ``md5`` field is always ``None``.

.. note::

Expand Down Expand Up @@ -129,6 +137,7 @@ model. There are three model types: **notebook**, **file**, and **directory**.
"path": "foo/a.ipynb",
"type": "notebook",
"writable": True,
"md5": "7e47382b370c05a1b14706a2a8aff91a",
}
# Notebook Model without Content
Expand Down
25 changes: 24 additions & 1 deletion jupyter_server/services/contents/fileio.py
Expand Up @@ -4,6 +4,7 @@
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
import errno
import hashlib
import os
import shutil
from base64 import decodebytes, encodebytes
Expand Down Expand Up @@ -268,7 +269,9 @@ def _read_notebook(self, os_path, as_version=4, capture_validation_error=None):
with self.open(os_path, "r", encoding="utf-8") as f:
try:
return nbformat.read(
f, as_version=as_version, capture_validation_error=capture_validation_error
f,
as_version=as_version,
capture_validation_error=capture_validation_error,
)
except Exception as e:
e_orig = e
Expand Down Expand Up @@ -309,13 +312,17 @@ def _read_file(self, os_path, format):
format:
If 'text', the contents will be decoded as UTF-8.
If 'base64', the raw bytes contents will be encoded as base64.
If 'byte', the raw bytes contents will be returned.
If not specified, try to decode as UTF-8, and fall back to base64
"""
if not os.path.isfile(os_path):
raise HTTPError(400, "Cannot read non-file %s" % os_path)

with self.open(os_path, "rb") as f:
bcontent = f.read()
if format == "byte":
# Not for http response but internal use
return bcontent, "byte"

if format is None or format == "text":
# Try to interpret as unicode if format is unknown or if unicode
Expand Down Expand Up @@ -350,6 +357,12 @@ def _save_file(self, os_path, content, format):
with self.atomic_writing(os_path, text=False) as f:
f.write(bcontent)

def _get_md5(self, os_path):
    """Return the MD5 hex digest of the file at ``os_path``.

    The file is fetched through ``self._read_file`` with the ``"byte"``
    format, so the digest is computed over the raw, undecoded bytes.

    Parameters
    ----------
    os_path : str
        Filesystem path of the file to hash.

    Returns
    -------
    str
        Hexadecimal MD5 digest of the file's bytes.
    """
    # The second tuple element is the format tag echoed back by _read_file;
    # it is not needed for hashing.
    raw_bytes, _fmt = self._read_file(os_path, "byte")
    return hashlib.md5(raw_bytes).hexdigest()  # noqa: S324


class AsyncFileManagerMixin(FileManagerMixin):
"""
Expand Down Expand Up @@ -417,13 +430,17 @@ async def _read_file(self, os_path, format):
format:
If 'text', the contents will be decoded as UTF-8.
If 'base64', the raw bytes contents will be encoded as base64.
If 'byte', the raw bytes contents will be returned.
If not specified, try to decode as UTF-8, and fall back to base64
"""
if not os.path.isfile(os_path):
raise HTTPError(400, "Cannot read non-file %s" % os_path)

with self.open(os_path, "rb") as f:
bcontent = await run_sync(f.read)
if format == "byte":
# Not for http response but internal use
return bcontent, "byte"

if format is None or format == "text":
# Try to interpret as unicode if format is unknown or if unicode
Expand Down Expand Up @@ -457,3 +474,9 @@ async def _save_file(self, os_path, content, format):

with self.atomic_writing(os_path, text=False) as f:
await run_sync(f.write, bcontent)

async def _get_md5(self, os_path):
    """Return the MD5 hex digest of the file at ``os_path`` (async variant).

    The file is fetched by awaiting ``self._read_file`` with the ``"byte"``
    format, so the digest is computed over the raw, undecoded bytes.

    Parameters
    ----------
    os_path : str
        Filesystem path of the file to hash.

    Returns
    -------
    str
        Hexadecimal MD5 digest of the file's bytes.
    """
    # The format tag returned alongside the bytes is not needed here.
    raw_bytes, _fmt = await self._read_file(os_path, "byte")
    hasher = hashlib.md5()  # noqa: S324
    # Feed the bytes via run_sync, the same way this mixin awaits its other
    # blocking calls (file read/write), instead of calling update() inline.
    await run_sync(hasher.update, raw_bytes)
    digest = hasher.hexdigest()
    return digest
43 changes: 31 additions & 12 deletions jupyter_server/services/contents/filemanager.py
Expand Up @@ -268,6 +268,7 @@ def _base_model(self, path):
model["mimetype"] = None
model["size"] = size
model["writable"] = self.is_writable(path)
model["md5"] = None

return model

Expand Down Expand Up @@ -335,7 +336,7 @@ def _dir_model(self, path, content=True):

return model

def _file_model(self, path, content=True, format=None):
def _file_model(self, path, content=True, format=None, md5=False):
"""Build a model for a file
if content is requested, include the file contents.
Expand Down Expand Up @@ -364,10 +365,13 @@ def _file_model(self, path, content=True, format=None):
content=content,
format=format,
)
if md5:
md5 = self._get_md5(os_path)
model.update(md5=md5)

return model

def _notebook_model(self, path, content=True):
def _notebook_model(self, path, content=True, md5=False):
"""Build a notebook model
if content is requested, the notebook content will be populated
Expand All @@ -386,10 +390,12 @@ def _notebook_model(self, path, content=True):
model["content"] = nb
model["format"] = "json"
self.validate_notebook_model(model, validation_error)
if md5:
model["md5"] = self._get_md5(os_path)

return model

def get(self, path, content=True, type=None, format=None):
def get(self, path, content=True, type=None, format=None, md5=None):
"""Takes a path for an entity and returns its model
Parameters
Expand All @@ -404,6 +410,8 @@ def get(self, path, content=True, type=None, format=None):
format : str, optional
The requested format for file contents. 'text' or 'base64'.
Ignored if this returns a notebook or directory model.
md5: bool, optional
Whether to include the md5 of the file contents.
Returns
-------
Expand Down Expand Up @@ -431,11 +439,11 @@ def get(self, path, content=True, type=None, format=None):
)
model = self._dir_model(path, content=content)
elif type == "notebook" or (type is None and path.endswith(".ipynb")):
model = self._notebook_model(path, content=content)
model = self._notebook_model(path, content=content, md5=md5)
else:
if type == "directory":
raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type")
model = self._file_model(path, content=content, format=format)
model = self._file_model(path, content=content, format=format, md5=md5)
self.emit(data={"action": "get", "path": path})
return model

Expand Down Expand Up @@ -686,7 +694,9 @@ def _get_dir_size(self, path="."):
).stdout.split()
else:
result = subprocess.run(
["du", "-s", "--block-size=1", path], capture_output=True, check=True
["du", "-s", "--block-size=1", path],
capture_output=True,
check=True,
).stdout.split()

self.log.info(f"current status of du command {result}")
Expand Down Expand Up @@ -784,7 +794,7 @@ async def _dir_model(self, path, content=True):

return model

async def _file_model(self, path, content=True, format=None):
async def _file_model(self, path, content=True, format=None, md5=False):
"""Build a model for a file
if content is requested, include the file contents.
Expand Down Expand Up @@ -813,10 +823,13 @@ async def _file_model(self, path, content=True, format=None):
content=content,
format=format,
)
if md5:
md5 = await self._get_md5(os_path)
model.update(md5=md5)

return model

async def _notebook_model(self, path, content=True):
async def _notebook_model(self, path, content=True, md5=False):
"""Build a notebook model
if content is requested, the notebook content will be populated
Expand All @@ -835,10 +848,12 @@ async def _notebook_model(self, path, content=True):
model["content"] = nb
model["format"] = "json"
self.validate_notebook_model(model, validation_error)
if md5:
model["md5"] = await self._get_md5(os_path)

return model

async def get(self, path, content=True, type=None, format=None):
async def get(self, path, content=True, type=None, format=None, md5=False):
"""Takes a path for an entity and returns its model
Parameters
Expand All @@ -853,6 +868,8 @@ async def get(self, path, content=True, type=None, format=None):
format : str, optional
The requested format for file contents. 'text' or 'base64'.
Ignored if this returns a notebook or directory model.
md5: bool, optional
Whether to include the md5 of the file contents.
Returns
-------
Expand All @@ -875,11 +892,11 @@ async def get(self, path, content=True, type=None, format=None):
)
model = await self._dir_model(path, content=content)
elif type == "notebook" or (type is None and path.endswith(".ipynb")):
model = await self._notebook_model(path, content=content)
model = await self._notebook_model(path, content=content, md5=md5)
else:
if type == "directory":
raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type")
model = await self._file_model(path, content=content, format=format)
model = await self._file_model(path, content=content, format=format, md5=md5)
self.emit(data={"action": "get", "path": path})
return model

Expand Down Expand Up @@ -1147,7 +1164,9 @@ async def _get_dir_size(self, path: str = ".") -> str:
).stdout.split()
else:
result = subprocess.run(
["du", "-s", "--block-size=1", path], capture_output=True, check=True
["du", "-s", "--block-size=1", path],
capture_output=True,
check=True,
).stdout.split()

self.log.info(f"current status of du command {result}")
Expand Down

0 comments on commit ea6ceee

Please sign in to comment.