pypa · brettcannon · Aug 18, 2023 · Feb 6, 2023 · Feb 6, 2023 · Feb 7, 2023
diff --git a/docs/utils.rst b/docs/utils.rst
@@ -14,7 +14,7 @@ Reference
 
     A :class:`typing.NewType` of :class:`str`, representing a normalized name.
 
-.. function:: canonicalize_name(name)
+.. function:: canonicalize_name(name, validate=False)
 
     This function takes a valid Python package or extra name, and returns the
     normalized form of it.
@@ -23,7 +23,13 @@ Reference
     checkers to help require that a string has passed through this function
     before use.
 
+    If **validate** is true, then the function will check if **name** is a valid
+    distribution name before normalizing.
+
     :param str name: The name to normalize.
+    :param bool validate: Check whether the name is a valid distribution name.
+    :raises InvalidName: If **validate** is true and the name is not an
+        acceptable distribution name.
 
     .. doctest::
 
@@ -35,6 +41,21 @@ Reference
         >>> canonicalize_name("requests")
         'requests'
 
+.. function:: is_normalized_name(name)
+
+    Check if a name is already normalized (i.e. :func:`canonicalize_name` would
+    roundtrip to the same value).
+
+    :param str name: The name to check.
+
+    .. doctest::
+
+        >>> from packaging.utils import is_normalized_name
+        >>> is_normalized_name("requests")
+        True
+        >>> is_normalized_name("Django")
+        False
+
 .. function:: canonicalize_version(version)
 
     This function takes a string representing a package version (or a
@@ -103,6 +124,9 @@ Reference
         >>> ver == Version('1.0')
         True
 
+.. exception:: InvalidName
+
+    Raised when a distribution name is invalid.
 
 .. exception:: InvalidWheelFilename
 

diff --git a/src/packaging/metadata.py b/src/packaging/metadata.py
@@ -5,23 +5,39 @@
 import email.policy
 import sys
 import typing
-from typing import Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Callable, Dict, Generic, List, Optional, Tuple, Union, cast
+
+from . import requirements, specifiers, utils, version as version_module
+
+T = typing.TypeVar("T")
 
 if sys.version_info >= (3, 8):  # pragma: no cover
-    from typing import TypedDict
+    from typing import Literal, TypedDict
 else:  # pragma: no cover
     if typing.TYPE_CHECKING:
-        from typing_extensions import TypedDict
+        from typing_extensions import Literal, TypedDict
     else:
         try:
-            from typing_extensions import TypedDict
+            from typing_extensions import Literal, TypedDict
         except ImportError:
 
+            class Literal:
+                # Minimal runtime stand-in used when neither `typing.Literal`
+                # nor `typing_extensions.Literal` can be imported.
+                def __init_subclass__(*_args, **_kwargs):
+                    pass
+
+                # This module subscripts `Literal[...]` at import time, so the
+                # stand-in has to accept subscription; a plain class would
+                # raise "TypeError: 'type' object is not subscriptable".
+                def __class_getitem__(cls, _params):
+                    return cls
+
             class TypedDict:
                 def __init_subclass__(*_args, **_kwargs):
                     pass
 
 
+class InvalidMetadata(ValueError):
+    """Error raised when a metadata field holds invalid data.
+
+    The exception text is *message*; the field the error refers to is kept
+    on the ``field`` attribute.
+    """
+
+    def __init__(self, field: str, message: str) -> None:
+        super().__init__(message)
+        # Name of the field the error refers to, exactly as supplied.
+        self.field = field
+
+
 # The RawMetadata class attempts to make as few assumptions about the underlying
 # serialization formats as possible. The idea is that as long as a serialization
 # formats offer some very basic primitives in *some* way then we can support
@@ -110,7 +126,7 @@ class RawMetadata(TypedDict, total=False):
     "version",
 }
 
-_LIST_STRING_FIELDS = {
+_LIST_FIELDS = {
     "classifiers",
     "dynamic",
     "obsoletes",
@@ -125,6 +141,10 @@ class RawMetadata(TypedDict, total=False):
     "supported_platforms",
 }
 
+_DICT_FIELDS = {
+    "project_urls",
+}
+
 
 def _parse_keywords(data: str) -> List[str]:
     """Split a string of comma-separate keyboards into a list of keywords."""
@@ -230,6 +250,7 @@ def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str:
     "supported-platform": "supported_platforms",
     "version": "version",
 }
+_RAW_TO_EMAIL_MAPPING = {raw: email for email, raw in _EMAIL_TO_RAW_MAPPING.items()}
 
 
 def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[str]]]:
@@ -349,7 +370,7 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[st
         # If this is one of our list of string fields, then we can just assign
         # the value, since email *only* has strings, and our get_all() call
         # above ensures that this is a list.
-        elif raw_name in _LIST_STRING_FIELDS:
+        elif raw_name in _LIST_FIELDS:
             raw[raw_name] = value
         # Special Case: Keywords
         # The keywords field is implemented in the metadata spec as a str,
@@ -406,3 +427,193 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[st
     # way this function is implemented, our `TypedDict` can only have valid key
     # names.
     return cast(RawMetadata, raw), unparsed
+
+
+_NOT_FOUND = object()
+
+# "2.0" is technically invalid, but people used it while waiting for "2.1".
+_MetadataVersion = Literal["1.0", "1.1", "1.2", "2.0", "2.1", "2.2", "2.3"]
+
+
+class _Validator(Generic[T]):
+    """Descriptor that validates and converts one metadata field on first access.
+
+    The first read pulls the value out of the owning instance's ``_raw``
+    mapping, runs the matching ``_process_<field>`` converter (when one
+    exists), and stores the result in the instance ``__dict__``. The raw
+    entry is then removed from ``_raw``.
+    """
+
+    # Attribute name on the owning class; assigned by __set_name__().
+    name: str
+    # Email-header spelling of the field, looked up in _RAW_TO_EMAIL_MAPPING;
+    # this is the name reported in error messages.
+    raw_name: str
+    # Metadata version associated with the field. Only stored here; presumably
+    # the version that introduced the field -- TODO confirm.
+    added: _MetadataVersion
+
+    def __init__(
+        self,
+        *,
+        added: _MetadataVersion = "1.0",
+    ) -> None:
+        self.added = added
+
+    def __set_name__(self, _owner: "Metadata", name: str) -> None:
+        self.name = name
+        self.raw_name = _RAW_TO_EMAIL_MAPPING[name]
+
+    # `typing.Type[...]` rather than `type[...]`: the annotation is evaluated
+    # when the method is defined, and the builtin `type` is not subscriptable
+    # before Python 3.9.
+    def __get__(self, instance: "Metadata", _owner: typing.Type["Metadata"]) -> T:
+        """Return the validated value of this field, computing it on first use.
+
+        :raises InvalidMetadata: If the field is missing from the raw data and
+            is not a known string, list, or dict field. Exceptions raised by
+            the field's converter propagate unchanged.
+        """
+        # With Python 3.8, the caching can be replaced with functools.cached_property().
+        cache = instance.__dict__
+        value = cache.get(self.name, _NOT_FOUND)
+        if value is _NOT_FOUND:
+            try:
+                value = instance._raw[self.name]  # type: ignore[literal-required]
+            except KeyError:
+                # Missing fields fall back to an empty value of the right shape.
+                if self.name in _STRING_FIELDS:
+                    value = ""
+                elif self.name in _LIST_FIELDS:
+                    value = []
+                elif self.name in _DICT_FIELDS:
+                    value = {}
+                else:
+                    raise InvalidMetadata(
+                        self.raw_name, f"unrecognized field: {self.raw_name!r}"
+                    )
+            # Only the converter lookup is guarded, so an AttributeError raised
+            # inside a converter is not mistaken for "no converter".
+            try:
+                converter: Callable[[Any], T] = getattr(self, f"_process_{self.name}")
+            except AttributeError:
+                pass
+            else:
+                value = converter(value)
+            cache[self.name] = value
+            try:
+                del instance._raw[self.name]  # type: ignore[misc]
+            except KeyError:
+                pass
+        return cast(T, value)
+
+    def _invalid_metadata(self, msg: str) -> InvalidMetadata:
+        """Build an InvalidMetadata for this field from a message template.
+
+        Every literal ``{field}`` in *msg* is replaced by the repr of the
+        field's email-header name.
+        """
+        # Plain replacement rather than str.format_map(): callers interpolate
+        # user-supplied values into *msg* first, and braces in those values
+        # would make format_map() raise KeyError/ValueError or alter the text.
+        return InvalidMetadata(
+            self.raw_name, msg.replace("{field}", repr(self.raw_name))
+        )
+
+    def _process_name(self, value: str) -> utils.NormalizedName:
+        """Require a non-empty name and return it validated and normalized."""
+        if not value:
+            raise self._invalid_metadata("{field} is a required field")
+        return utils.canonicalize_name(value, validate=True)
+
+    def _process_version(self, value: str) -> version_module.Version:
+        """Require a non-empty version and return it parsed."""
+        if not value:
+            raise self._invalid_metadata("{field} is a required field")
+        return version_module.parse(value)
+
+    def _process_summary(self, value: str) -> str:
+        """Check the field contains no newlines."""
+        if "\n" in value:
+            raise self._invalid_metadata("{field} must be a single line")
+        return value
+
+    def _process_metadata_version(self, value: str) -> _MetadataVersion:
+        """Check the value is one of the accepted metadata versions."""
+        if value not in {"1.0", "1.1", "1.2", "2.0", "2.1", "2.2", "2.3"}:
+            raise self._invalid_metadata(f"{value!r} is not a valid metadata version")
+        return cast(_MetadataVersion, value)
+
+    def _process_description_content_type(self, value: str) -> str:
+        """Check the content type, charset, and Markdown variant; return *value*."""
+        content_types = {"text/plain", "text/x-rst", "text/markdown"}
+        message = email.message.EmailMessage()
+        message["content-type"] = value
+        content_type, parameters = (
+            message.get_content_type(),  # Defaults to `text/plain` if not parseable.
+            message["content-type"].params,
+        )
+        # Check if content-type is valid or defaulted to `text/plain` and thus was
+        # not parseable.
+        if content_type not in content_types or content_type not in value:
+            raise self._invalid_metadata(
+                f"{{field}} must be one of {list(content_types)}, not {value!r}"
+            )
+
+        charset = parameters.get("charset", "UTF-8")
+        if charset != "UTF-8":
+            # Report the charset itself; list(charset) would split the string
+            # into individual characters.
+            raise self._invalid_metadata(
+                f"{{field}} can only specify the UTF-8 charset, not {charset!r}"
+            )
+
+        markdown_variants = {"GFM", "CommonMark"}
+        variant = parameters.get("variant", "GFM")  # Use an acceptable default.
+        if content_type == "text/markdown" and variant not in markdown_variants:
+            raise self._invalid_metadata(
+                f"valid Markdown variants for {{field}} are {list(markdown_variants)}, "
+                f"not {variant!r}",
+            )
+        return value
+
+    def _process_dynamic(self, value: List[str]) -> List[str]:
+        """Lowercase the entries and reject disallowed or unknown field names."""
+        lowered = [entry.lower() for entry in value]
+        for dynamic_field in lowered:
+            # Name the offending entry in the message, not the whole list.
+            if dynamic_field in {"name", "version", "metadata-version"}:
+                raise self._invalid_metadata(
+                    f"{dynamic_field!r} is not allowed as a dynamic field"
+                )
+            elif dynamic_field not in _EMAIL_TO_RAW_MAPPING:
+                raise self._invalid_metadata(
+                    f"{dynamic_field!r} is not a valid dynamic field"
+                )
+        return lowered
+
+    @staticmethod
+    def _process_provides_extra(
+        value: List[str],
+    ) -> List[utils.NormalizedName]:
+        """Validate and normalize every extra name."""
+        return [utils.canonicalize_name(name, validate=True) for name in value]
+
+    _process_requires_python = staticmethod(specifiers.SpecifierSet)
+
+    @staticmethod
+    def _process_requires_dist(
+        value: List[str],
+    ) -> List[requirements.Requirement]:
+        """Parse every entry into a Requirement."""
+        return list(map(requirements.Requirement, value))
+
+
+class Metadata:
+    """Validated view of core metadata, backed by a RawMetadata mapping.
+
+    Each field below is a _Validator descriptor that converts its raw value
+    the first time the attribute is read.
+    """
+
+    _raw: RawMetadata
+
+    @classmethod
+    def from_raw(cls, data: RawMetadata) -> "Metadata":
+        """Create an instance from already-parsed raw metadata.
+
+        :param data: The raw metadata; a shallow copy is stored.
+        """
+        ins = cls()
+        # Reading a field deletes its entry from `_raw` once the converted
+        # value is cached, so keep a copy instead of the caller's mapping.
+        ins._raw = data.copy()
+        return ins
+
+    @classmethod
+    def from_email(cls, data: Union[bytes, str]) -> "Metadata":
+        """Parse metadata from an email message."""
+        # TODO Check `unparsed` for valid keys?
+        raw, _unparsed = parse_email(data)
+        return cls.from_raw(raw)
+
+    # TODO Check that fields are specified in a valid metadata version?
+    #      Should that be done per-field, or should it be done in a more
+    #      over-arching validate() method?
+
+    metadata_version: _Validator[_MetadataVersion] = _Validator()
+    name: _Validator[utils.NormalizedName] = _Validator()
+    version: _Validator[version_module.Version] = _Validator()
+    dynamic: _Validator[List[str]] = _Validator(
+        added="2.2",
+    )
+    # NOTE(review): annotated as `str` while `supported_platforms` is
+    # `List[str]` -- confirm against the RawMetadata declaration.
+    platforms: _Validator[str] = _Validator()
+    supported_platforms: _Validator[List[str]] = _Validator(added="1.1")
+    summary: _Validator[str] = _Validator()
+    description: _Validator[str] = _Validator()  # TODO 2.1: can be in body
+    # TODO are the various parts of description_content_type case-insensitive?
+    description_content_type: _Validator[str] = _Validator(added="2.1")
+    keywords: _Validator[List[str]] = _Validator()
+    home_page: _Validator[str] = _Validator()
+    download_url: _Validator[str] = _Validator(added="1.1")
+    author: _Validator[str] = _Validator()
+    author_email: _Validator[str] = _Validator()
+    maintainer: _Validator[str] = _Validator(added="1.2")
+    maintainer_email: _Validator[str] = _Validator(added="1.2")
+    license: _Validator[str] = _Validator()
+    classifiers: _Validator[List[str]] = _Validator(added="1.1")
+    requires_dist: _Validator[List[requirements.Requirement]] = _Validator(added="1.2")
+    requires_python: _Validator[specifiers.SpecifierSet] = _Validator(added="1.2")
+    # Because `Requires-External` allows for non-PEP 440 version specifiers, we
+    # don't do any processing on the values.
+    requires_external: _Validator[List[str]] = _Validator(added="1.2")
+    project_urls: _Validator[Dict[str, str]] = _Validator(added="1.2")
+    # PEP 685 lets us raise an error if an extra doesn't pass `Name` validation
+    # regardless of metadata version.
+    provides_extra: _Validator[List[utils.NormalizedName]] = _Validator(
+        added="2.1",
+    )
+    provides_dist: _Validator[List[str]] = _Validator(added="1.2")
+    obsoletes_dist: _Validator[List[str]] = _Validator(added="1.2")
diff --git a/src/packaging/utils.py b/src/packaging/utils.py
@@ -12,6 +12,12 @@
 NormalizedName = NewType("NormalizedName", str)
 
 
+class InvalidName(ValueError):
+    """
+    An invalid distribution name; users should refer to the packaging user guide.
+
+    Raised by canonicalize_name() when validation is requested and the name
+    does not match the expected pattern.
+    """
+
+
 class InvalidWheelFilename(ValueError):
     """
     An invalid wheel filename was found, users should refer to PEP 427.
@@ -24,17 +30,28 @@ class InvalidSdistFilename(ValueError):
     """
 
 
+# Core metadata spec for `Name`
+_validate_regex = re.compile(
+    r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", re.IGNORECASE
+)
 _canonicalize_regex = re.compile(r"[-_.]+")
+# A normalized name is one or more lowercase alphanumeric runs joined by
+# single hyphens. Spelling the separator out structurally (instead of using a
+# lookahead after each repeated character) also rejects a doubled hyphen that
+# directly follows the first character, e.g. "a--b".
+_normalized_regex = re.compile(r"^[a-z0-9]+(?:-[a-z0-9]+)*$")
 # PEP 427: The build number must start with a digit.
 _build_tag_regex = re.compile(r"(\d+)(.*)")
 
 
-def canonicalize_name(name: str) -> NormalizedName:
+def canonicalize_name(name: str, validate: bool = False) -> NormalizedName:
+    """Normalize a project or extra name.
+
+    Runs of ``-``, ``_`` and ``.`` are collapsed to a single ``-`` and the
+    result is lowercased.
+
+    :param name: The name to normalize.
+    :param validate: When true, reject names that do not match the ``Name``
+        validation pattern before normalizing.
+    :raises InvalidName: If *validate* is true and *name* is not valid.
+    """
+    # fullmatch() rather than match(): `$` also matches just before a trailing
+    # newline, which would let a name ending in "\n" pass validation.
+    if validate and not _validate_regex.fullmatch(name):
+        raise InvalidName(f"name is invalid: {name!r}")
     # This is taken from PEP 503.
     value = _canonicalize_regex.sub("-", name).lower()
     return cast(NormalizedName, value)
 
 
+def is_normalized_name(name: str) -> bool:
+    """Return whether *name* is already in normalized form.
+
+    :param name: The name to check.
+    """
+    # fullmatch() rather than match(): `$` also matches just before a trailing
+    # newline, so match() would report a name ending in "\n" as normalized.
+    return _normalized_regex.fullmatch(name) is not None
+
+
 def canonicalize_version(
     version: Union[Version, str], *, strip_trailing_zero: bool = True
 ) -> str:
@@ -100,7 +117,7 @@ def parse_wheel_filename(
 
     parts = filename.split("-", dashes - 2)
     name_part = parts[0]
-    # See PEP 427 for the rules on escaping the project name
+    # See PEP 427 for the rules on escaping the project name.
     if "__" in name_part or re.match(r"^[\w\d._]*$", name_part, re.UNICODE) is None:
         raise InvalidWheelFilename(f"Invalid project name: {filename}")
     name = canonicalize_name(name_part)