Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support get file(notebook) md5 #1363

Merged
merged 6 commits into from Nov 19, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/source/developers/contents.rst
Expand Up @@ -63,6 +63,10 @@ Models may contain the following entries:
| |``None`` |if any. (:ref:`See |
| | |Below<modelcontent>`) |
+--------------------+-----------+------------------------------+
|**md5** |unicode or |The md5 of the contents. |
| |``None`` | |
| | | |
+--------------------+-----------+------------------------------+

.. _modelcontent:

Expand All @@ -76,6 +80,8 @@ model. There are three model types: **notebook**, **file**, and **directory**.
:class:`nbformat.notebooknode.NotebookNode` representing the .ipynb file
represented by the model. See the `NBFormat`_ documentation for a full
description.
- The ``md5`` field is a hexdigest string of the md5 value of the notebook
file.

- ``file`` models
- The ``format`` field is either ``"text"`` or ``"base64"``.
Expand All @@ -85,12 +91,14 @@ model. There are three model types: **notebook**, **file**, and **directory**.
file models, ``content`` simply contains the file's bytes after decoding
as UTF-8. Non-text (``base64``) files are read as bytes, base64 encoded,
and then decoded as UTF-8.
- The ``md5`` field is a hexdigest string of the md5 value of the file.

- ``directory`` models
- The ``format`` field is always ``"json"``.
- The ``mimetype`` field is always ``None``.
- The ``content`` field contains a list of :ref:`content-free<contentfree>`
models representing the entities in the directory.
- The ``md5`` field is always ``None``.

.. note::

Expand Down Expand Up @@ -129,6 +137,7 @@ model. There are three model types: **notebook**, **file**, and **directory**.
"path": "foo/a.ipynb",
"type": "notebook",
"writable": True,
"md5": "7e47382b370c05a1b14706a2a8aff91a",
}

# Notebook Model without Content
Expand Down
25 changes: 24 additions & 1 deletion jupyter_server/services/contents/fileio.py
Expand Up @@ -4,6 +4,7 @@
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
import errno
import hashlib
import os
import shutil
from base64 import decodebytes, encodebytes
Expand Down Expand Up @@ -268,7 +269,9 @@ def _read_notebook(self, os_path, as_version=4, capture_validation_error=None):
with self.open(os_path, "r", encoding="utf-8") as f:
try:
return nbformat.read(
f, as_version=as_version, capture_validation_error=capture_validation_error
f,
as_version=as_version,
capture_validation_error=capture_validation_error,
)
except Exception as e:
e_orig = e
Expand Down Expand Up @@ -309,13 +312,17 @@ def _read_file(self, os_path, format):
format:
If 'text', the contents will be decoded as UTF-8.
If 'base64', the raw bytes contents will be encoded as base64.
If 'byte', the raw bytes contents will be returned.
If not specified, try to decode as UTF-8, and fall back to base64
"""
if not os.path.isfile(os_path):
raise HTTPError(400, "Cannot read non-file %s" % os_path)

with self.open(os_path, "rb") as f:
bcontent = f.read()
if format == "byte":
# Not for http response but internal use
return bcontent, "byte"

if format is None or format == "text":
# Try to interpret as unicode if format is unknown or if unicode
Expand Down Expand Up @@ -350,6 +357,12 @@ def _save_file(self, os_path, content, format):
with self.atomic_writing(os_path, text=False) as f:
f.write(bcontent)

def _get_md5(self, os_path):
    """Return the hexadecimal MD5 digest of the file at ``os_path``.

    The file is fetched through ``self._read_file`` using the ``"byte"``
    format, so the digest is computed over the raw, undecoded bytes.
    Any error raised by ``_read_file`` propagates to the caller.
    """
    raw_bytes, _fmt = self._read_file(os_path, "byte")
    # MD5 serves as a content checksum here, hence the lint suppression.
    return hashlib.md5(raw_bytes).hexdigest()  # noqa: S324


class AsyncFileManagerMixin(FileManagerMixin):
"""
Expand Down Expand Up @@ -417,13 +430,17 @@ async def _read_file(self, os_path, format):
format:
If 'text', the contents will be decoded as UTF-8.
If 'base64', the raw bytes contents will be encoded as base64.
If 'byte', the raw bytes contents will be returned.
If not specified, try to decode as UTF-8, and fall back to base64
"""
if not os.path.isfile(os_path):
raise HTTPError(400, "Cannot read non-file %s" % os_path)

with self.open(os_path, "rb") as f:
bcontent = await run_sync(f.read)
if format == "byte":
# Not for http response but internal use
return bcontent, "byte"

if format is None or format == "text":
# Try to interpret as unicode if format is unknown or if unicode
Expand Down Expand Up @@ -457,3 +474,9 @@ async def _save_file(self, os_path, content, format):

with self.atomic_writing(os_path, text=False) as f:
await run_sync(f.write, bcontent)

async def _get_md5(self, os_path):
    """Return the hexadecimal MD5 digest of the file at ``os_path``.

    Asynchronous counterpart of the synchronous ``_get_md5``: the raw
    bytes are awaited from ``self._read_file`` with the ``"byte"``
    format and then hashed. Any error raised by ``_read_file``
    propagates to the caller.
    """
    payload, _fmt = await self._read_file(os_path, "byte")
    hasher = hashlib.md5()  # noqa: S324
    # Feed the hasher through run_sync, the same helper this mixin uses
    # for its blocking file reads and writes.
    await run_sync(hasher.update, payload)
    return hasher.hexdigest()
43 changes: 31 additions & 12 deletions jupyter_server/services/contents/filemanager.py
Expand Up @@ -268,6 +268,7 @@ def _base_model(self, path):
model["mimetype"] = None
model["size"] = size
model["writable"] = self.is_writable(path)
model["md5"] = None

return model

Expand Down Expand Up @@ -335,7 +336,7 @@ def _dir_model(self, path, content=True):

return model

def _file_model(self, path, content=True, format=None):
def _file_model(self, path, content=True, format=None, md5=False):
"""Build a model for a file

if content is requested, include the file contents.
Expand Down Expand Up @@ -364,10 +365,13 @@ def _file_model(self, path, content=True, format=None):
content=content,
format=format,
)
if md5:
md5 = self._get_md5(os_path)
model.update(md5=md5)

return model

def _notebook_model(self, path, content=True):
def _notebook_model(self, path, content=True, md5=False):
"""Build a notebook model

if content is requested, the notebook content will be populated
Expand All @@ -386,10 +390,12 @@ def _notebook_model(self, path, content=True):
model["content"] = nb
model["format"] = "json"
self.validate_notebook_model(model, validation_error)
if md5:
model["md5"] = self._get_md5(os_path)

return model

def get(self, path, content=True, type=None, format=None):
def get(self, path, content=True, type=None, format=None, md5=None):
"""Takes a path for an entity and returns its model

Parameters
Expand All @@ -404,6 +410,8 @@ def get(self, path, content=True, type=None, format=None):
format : str, optional
The requested format for file contents. 'text' or 'base64'.
Ignored if this returns a notebook or directory model.
md5: bool, optional
Whether to include the md5 of the file contents.

Returns
-------
Expand Down Expand Up @@ -431,11 +439,11 @@ def get(self, path, content=True, type=None, format=None):
)
model = self._dir_model(path, content=content)
elif type == "notebook" or (type is None and path.endswith(".ipynb")):
model = self._notebook_model(path, content=content)
model = self._notebook_model(path, content=content, md5=md5)
else:
if type == "directory":
raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type")
model = self._file_model(path, content=content, format=format)
model = self._file_model(path, content=content, format=format, md5=md5)
self.emit(data={"action": "get", "path": path})
return model

Expand Down Expand Up @@ -686,7 +694,9 @@ def _get_dir_size(self, path="."):
).stdout.split()
else:
result = subprocess.run(
["du", "-s", "--block-size=1", path], capture_output=True, check=True
["du", "-s", "--block-size=1", path],
capture_output=True,
check=True,
).stdout.split()

self.log.info(f"current status of du command {result}")
Expand Down Expand Up @@ -784,7 +794,7 @@ async def _dir_model(self, path, content=True):

return model

async def _file_model(self, path, content=True, format=None):
async def _file_model(self, path, content=True, format=None, md5=False):
"""Build a model for a file

if content is requested, include the file contents.
Expand Down Expand Up @@ -813,10 +823,13 @@ async def _file_model(self, path, content=True, format=None):
content=content,
format=format,
)
if md5:
md5 = await self._get_md5(os_path)
model.update(md5=md5)

return model

async def _notebook_model(self, path, content=True):
async def _notebook_model(self, path, content=True, md5=False):
"""Build a notebook model

if content is requested, the notebook content will be populated
Expand All @@ -835,10 +848,12 @@ async def _notebook_model(self, path, content=True):
model["content"] = nb
model["format"] = "json"
self.validate_notebook_model(model, validation_error)
if md5:
model["md5"] = await self._get_md5(os_path)

return model

async def get(self, path, content=True, type=None, format=None):
async def get(self, path, content=True, type=None, format=None, md5=False):
"""Takes a path for an entity and returns its model

Parameters
Expand All @@ -853,6 +868,8 @@ async def get(self, path, content=True, type=None, format=None):
format : str, optional
The requested format for file contents. 'text' or 'base64'.
Ignored if this returns a notebook or directory model.
md5: bool, optional
Whether to include the md5 of the file contents.

Returns
-------
Expand All @@ -875,11 +892,11 @@ async def get(self, path, content=True, type=None, format=None):
)
model = await self._dir_model(path, content=content)
elif type == "notebook" or (type is None and path.endswith(".ipynb")):
model = await self._notebook_model(path, content=content)
model = await self._notebook_model(path, content=content, md5=md5)
else:
if type == "directory":
raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type")
model = await self._file_model(path, content=content, format=format)
model = await self._file_model(path, content=content, format=format, md5=md5)
self.emit(data={"action": "get", "path": path})
return model

Expand Down Expand Up @@ -1147,7 +1164,9 @@ async def _get_dir_size(self, path: str = ".") -> str:
).stdout.split()
else:
result = subprocess.run(
["du", "-s", "--block-size=1", path], capture_output=True, check=True
["du", "-s", "--block-size=1", path],
capture_output=True,
check=True,
).stdout.split()

self.log.info(f"current status of du command {result}")
Expand Down