Comparing changes

base repository: caikit/caikit-nlp
base: v0.4.13
head repository: caikit/caikit-nlp
compare: v0.4.14
  • 15 commits
  • 7 files changed
  • 4 contributors

Commits on Jun 6, 2024

  1. add get_route_info

    Signed-off-by: Mynhardt Burger <Mynhardt.Burger@ibm.com>
    mynhardtburger committed Jun 6, 2024
    e278915
  2. lazily create model_connection and _client

    Signed-off-by: Mynhardt Burger <Mynhardt.Burger@ibm.com>
    mynhardtburger committed Jun 6, 2024
    e473c33
  3. lazy load model_connection and tgis client for peft

    Signed-off-by: Mynhardt Burger <Mynhardt.Burger@ibm.com>
    mynhardtburger committed Jun 6, 2024
    5578b7a
  4. remove commented out code

    Signed-off-by: Mynhardt Burger <Mynhardt.Burger@ibm.com>
    mynhardtburger committed Jun 6, 2024
    71371bd

Commits on Jun 7, 2024

  1. Address review comments

    Signed-off-by: Mynhardt Burger <Mynhardt.Burger@ibm.com>
    mynhardtburger committed Jun 7, 2024
    21192c0
  2. Expand test_get_route_info

    Signed-off-by: Mynhardt Burger <Mynhardt.Burger@ibm.com>
    mynhardtburger committed Jun 7, 2024
    1c23527
  3. Lazily create generation client

    Signed-off-by: Mynhardt Burger <Mynhardt.Burger@ibm.com>
    mynhardtburger committed Jun 7, 2024
    a4e8539
  4. Update minimum caikit-tgis-backend version

    Signed-off-by: Mynhardt Burger <Mynhardt.Burger@ibm.com>
    mynhardtburger committed Jun 7, 2024
    be72a79
  5. Add debug logs

    Signed-off-by: Mynhardt Burger <Mynhardt.Burger@ibm.com>
    mynhardtburger committed Jun 7, 2024
    d5893d9
  6. Linting

    Signed-off-by: Mynhardt Burger <Mynhardt.Burger@ibm.com>
    mynhardtburger committed Jun 7, 2024
    9848943
  7. linting

    Signed-off-by: Mynhardt Burger <Mynhardt.Burger@ibm.com>
    mynhardtburger committed Jun 7, 2024
    5df0533
  8. Update caikit_nlp/toolkit/text_generation/tgis_utils.py

    Co-authored-by: Gabe Goodhart <gabe.l.hart@gmail.com>
    Signed-off-by: Mynhardt Burger <mynhardt@gmail.com>
    mynhardtburger and gabe-l-hart authored Jun 7, 2024
    6bae66a
  9. review comments

    Signed-off-by: Mynhardt Burger <Mynhardt.Burger@ibm.com>
    mynhardtburger committed Jun 7, 2024
    46ae073
  10. remove unreachable code

    Signed-off-by: Mynhardt Burger <Mynhardt.Burger@ibm.com>
    mynhardtburger committed Jun 7, 2024
    527b455
  11. Merge pull request #363 from mynhardtburger/lazy-model-connection

    Lazy model connection & x-route-info
    gabe-l-hart authored Jun 7, 2024
    3b2f8fd
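
Taken together, these commits (merged as PR #363) make the TGIS model connection lazy and add a per-request routing override carried in the `x-route-info` HTTP header / gRPC metadata key. Below is a minimal client-side sketch of the override, assuming a caikit HTTP runtime is serving the model; the endpoint path and payload shape are assumptions for illustration, not taken from this diff.

```python
# Sketch: send an x-route-info header so the runtime registers the named
# host as this model's remote TGIS connection before generating.
# ASSUMPTIONS: the endpoint path and JSON payload shape are illustrative.
import requests

resp = requests.post(
    "http://localhost:8080/api/v1/task/text-generation",  # assumed route
    json={"model_id": "my-peft-model", "inputs": "Hello world"},
    headers={"x-route-info": "my-tgis-host.example.svc:8033"},
)
print(resp.status_code, resp.json())
```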
59 changes: 51 additions & 8 deletions caikit_nlp/modules/text_generation/peft_tgis_remote.py
@@ -14,7 +14,9 @@
"""This file contains a distributed backend implementation for leveraging the PEFT-trained
prompt vectors in TGIS generation requests.
"""

# Standard
from functools import cached_property
from typing import Iterable, List, Optional, Tuple, Union
import os

@@ -32,6 +34,7 @@
TokenizationResults,
)
from caikit.interfaces.nlp.tasks import TextGenerationTask, TokenizationTask
from caikit.interfaces.runtime.data_model import RuntimeServerContextType
from caikit_tgis_backend import TGISBackend
import alog

@@ -40,6 +43,7 @@
from ...toolkit.text_generation.tgis_utils import (
GENERATE_FUNCTION_TGIS_ARGS,
TGISGenerationClient,
get_route_info,
)
from ...toolkit.verbalizer_utils import render_verbalizer
from . import PeftPromptTuning
@@ -68,15 +72,14 @@ def __init__(
prompt_artifacts: Optional[List[str]] = None,
) -> None:
super().__init__()
# Configure the internal client
# NOTE: This is made optional for the cases where we do not need to execute `.run` function
# for example, bootstrapping a model to caikit format and saving.
self._client = None

self._tgis_backend = tgis_backend
if enable_backend:
error.type_check(
"<NLP33971947E>", TGISBackend, tgis_backend=self._tgis_backend
)
# get_client will also launch a local TGIS process and get the model
# loaded when using the local TGIS backend
self._client = tgis_backend.get_client(base_model_name)

# Tell the backend to load all of the available prompt files
if prompt_artifacts:
@@ -107,6 +110,14 @@ def __del__(self):
if tgis_backend and prompt_cache_id and model_id:
tgis_backend.unload_prompt_artifacts(model_id, prompt_cache_id)

@cached_property
def _client(self):
# Configure the internal client
# NOTE: This is made optional for the cases where we do not need to execute `.run` function
# for example, bootstrapping a model to caikit format and saving.
if self._tgis_backend:
return self._tgis_backend.get_client(self.base_model_name)

@classmethod
def load(cls, model_path: str, load_backend: BackendBase) -> "PeftPromptTuningTGIS":
"""Load a TGIS Peft Prompt Tuning distributed module. Note that we do not
@@ -182,7 +193,7 @@ def save(self, model_path: str):
)

# pylint: disable=duplicate-code
@TextGenerationTask.taskmethod()
@TextGenerationTask.taskmethod(context_arg="context")
def run(
self,
text: str,
@@ -206,6 +217,7 @@ def run(
generated_tokens: bool = True,
token_logprobs: bool = True,
token_ranks: bool = True,
context: Optional[RuntimeServerContextType] = None,
) -> GeneratedTextResult:
f"""Run inference against the model running in TGIS.
@@ -221,6 +233,8 @@ def run(
self.enable_backend,
"Backend must be configured and loaded with this module before executing `run` call.",
)
self._register_model_connection_with_context(context)

verbalized_text = render_verbalizer(self.verbalizer, {"input": text})
return self.tgis_generation_client.unary_generate(
text=verbalized_text,
@@ -244,7 +258,7 @@ def run(
stop_sequences=stop_sequences,
)

@TextGenerationTask.taskmethod(output_streaming=True)
@TextGenerationTask.taskmethod(output_streaming=True, context_arg="context")
def run_stream_out(
self,
text: str,
@@ -268,6 +282,7 @@ def run_stream_out(
generated_tokens: bool = True,
token_logprobs: bool = True,
token_ranks: bool = True,
context: Optional[RuntimeServerContextType] = None,
) -> Iterable[GeneratedTextStreamResult]:
f"""Run output stream inferencing against the model running in TGIS
@@ -283,6 +298,9 @@ def run_stream_out(
"Backend must be configured and loaded with this module \
before executing `run_stream_out` call.",
)

self._register_model_connection_with_context(context)

verbalized_text = render_verbalizer(self.verbalizer, {"input": text})
return self.tgis_generation_client.stream_generate(
text=verbalized_text,
@@ -306,10 +324,11 @@ def run_stream_out(
stop_sequences=stop_sequences,
)

@TokenizationTask.taskmethod()
@TokenizationTask.taskmethod(context_arg="context")
def run_tokenizer(
self,
text: str,
context: Optional[RuntimeServerContextType] = None,
) -> TokenizationResults:
"""Run tokenization task against the model running in TGIS.
@@ -320,6 +339,30 @@ def run_tokenizer(
TokenizationResults
The token count
"""

self._register_model_connection_with_context(context)

return self.tgis_generation_client.unary_tokenize(
text=text,
)

def _register_model_connection_with_context(
self, context: Optional[RuntimeServerContextType]
):
"""
Register a remote model connection with the configured TGISBackend if there is
a context override provided.
"""
if self._tgis_backend:
if route_info := get_route_info(context):
log.debug(
"<NLP10705560D> Registering remote model connection with context "
"override: 'hostname: %s'",
route_info,
)
self._tgis_backend.register_model_connection(
self.base_model_name,
{"hostname": route_info},
fill_with_defaults=True,
)
self._model_loaded = True
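
The switch to `cached_property` above defers client creation until the first inference call, so bootstrap-and-save flows never open a TGIS connection. A self-contained sketch of the same pattern, where `FakeBackend` and `LazyModule` are stand-in names rather than anything from this diff:

```python
# Minimal sketch of the lazy-client pattern: the client is built on first
# attribute access and cached on the instance, so flows that never run
# inference never create a connection. FakeBackend is hypothetical.
from functools import cached_property


class FakeBackend:
    def get_client(self, model_name: str):
        print(f"creating client for {model_name}")  # happens only once
        return object()


class LazyModule:
    def __init__(self, tgis_backend, model_name):
        self._tgis_backend = tgis_backend  # may be None for save-only flows
        self.model_name = model_name

    @cached_property
    def _client(self):
        # Evaluated once on first access; later accesses hit the cache.
        if self._tgis_backend:
            return self._tgis_backend.get_client(self.model_name)
        return None


module = LazyModule(FakeBackend(), "my-model")
assert module._client is module._client  # "creating client" prints only once
```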
65 changes: 50 additions & 15 deletions caikit_nlp/modules/text_generation/text_generation_tgis.py
@@ -14,6 +14,7 @@


# Standard
from functools import cached_property
from typing import Iterable, List, Optional, Tuple, Union
import os

@@ -30,6 +31,7 @@
TokenizationResults,
)
from caikit.interfaces.nlp.tasks import TextGenerationTask, TokenizationTask
from caikit.interfaces.runtime.data_model import RuntimeServerContextType
from caikit_tgis_backend import TGISBackend
import alog

@@ -43,6 +45,7 @@
from ...toolkit.text_generation.tgis_utils import (
GENERATE_FUNCTION_TGIS_ARGS,
TGISGenerationClient,
get_route_info,
)
from .text_generation_local import TextGeneration

@@ -86,28 +89,33 @@ def __init__(
# Set _model_loaded as False by default. This will only get set to True if
# we enable the tgis_backend and we are able to fetch the client successfully.
self._model_loaded = False
# Configure the internal client
# NOTE: This is made optional for the cases where we do not need to execute `.run` function
# for example, bootstrapping a model to caikit format and saving.
self._client = None
if tgis_backend:
self._client = tgis_backend.get_client(model_name)
# mark that the model is loaded so that we can unload it later
self._model_loaded = True
self.tgis_backend = tgis_backend

self._tgis_backend = tgis_backend
self._bos_token = bos_token
self._sep_token = sep_token
self._eos_token = eos_token
self._pad_token = pad_token
self.tgis_generation_client = TGISGenerationClient(
self.model_name, self._eos_token, self._client, self.PRODUCER_ID
)

def __del__(self):
# nothing to unload if we didn't finish loading
if self._model_loaded and self.tgis_backend:
self.tgis_backend.unload_model(self.model_name)
if self._model_loaded and self._tgis_backend:
self._tgis_backend.unload_model(self.model_name)

@cached_property
def _client(self):
# Lazily configure/create the internal tgis backend client
if self._tgis_backend:
return self._tgis_backend.get_client(self.model_name)

@cached_property
def tgis_generation_client(self):
# Lazily create the generation client
# This in turn calls self._client which also lazily gets the tgis backend client
return TGISGenerationClient(
self.model_name, self._eos_token, self._client, self.PRODUCER_ID
)

@classmethod
def bootstrap(cls, model_path: str, load_backend: Union[BackendBase, None] = None):
@@ -207,7 +215,7 @@ def save(self, model_path: str):
)

# pylint: disable=duplicate-code
@TextGenerationTask.taskmethod()
@TextGenerationTask.taskmethod(context_arg="context")
def run(
self,
text: str,
@@ -231,6 +239,7 @@ def run(
generated_tokens: bool = True,
token_logprobs: bool = True,
token_ranks: bool = True,
context: Optional[RuntimeServerContextType] = None,
) -> GeneratedTextResult:
f"""Run inference against the model running in TGIS.
@@ -240,6 +249,8 @@ def run(
GeneratedTextResult
Generated text result produced by TGIS.
"""
self._register_model_connection_with_context(context)

if self._model_loaded:
return self.tgis_generation_client.unary_generate(
text=text,
@@ -263,7 +274,7 @@ def run(
stop_sequences=stop_sequences,
)

@TextGenerationTask.taskmethod(output_streaming=True)
@TextGenerationTask.taskmethod(output_streaming=True, context_arg="context")
def run_stream_out(
self,
text: str,
@@ -287,6 +298,7 @@ def run_stream_out(
generated_tokens: bool = True,
token_logprobs: bool = True,
token_ranks: bool = True,
context: Optional[RuntimeServerContextType] = None,
) -> Iterable[GeneratedTextStreamResult]:
f"""Run output stream inferencing for text generation module.
@@ -295,6 +307,7 @@ def run_stream_out(
Returns:
Iterable[GeneratedTextStreamResult]
"""
self._register_model_connection_with_context(context)

if self._model_loaded:
return self.tgis_generation_client.stream_generate(
@@ -319,10 +332,11 @@ def run_stream_out(
stop_sequences=stop_sequences,
)

@TokenizationTask.taskmethod()
@TokenizationTask.taskmethod(context_arg="context")
def run_tokenizer(
self,
text: str,
context: Optional[RuntimeServerContextType] = None,
) -> TokenizationResults:
"""Run tokenization task against the model running in TGIS.
@@ -333,7 +347,28 @@ def run_tokenizer(
TokenizationResults
The token count
"""
self._register_model_connection_with_context(context)

if self._model_loaded:
return self.tgis_generation_client.unary_tokenize(
text=text,
)

def _register_model_connection_with_context(
self, context: Optional[RuntimeServerContextType]
):
"""
Register a remote model connection with the configured TGISBackend if there is
a context override provided.
"""
if self._tgis_backend:
if route_info := get_route_info(context):
log.debug(
"<NLP15770311D> Registering remote model connection with context "
"override: 'hostname: %s'",
route_info,
)
self._tgis_backend.register_model_connection(
self.model_name, {"hostname": route_info}, fill_with_defaults=True
)
self._model_loaded = True
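
As in the PEFT module, the request context now flows into `run`, `run_stream_out`, and `run_tokenizer` through `taskmethod(context_arg="context")`, and `_register_model_connection_with_context` points the backend at the host named by `x-route-info` before the call executes. A rough sketch of that registration step in isolation, with a hypothetical `StubBackend` standing in for `TGISBackend` (only the connection-dict shape and `fill_with_defaults` flag come from the diff above):

```python
# Rough sketch of the per-request override step. StubBackend is a stand-in
# for TGISBackend, not a real class from this repository.
from typing import Optional


class StubBackend:
    def __init__(self):
        self.connections = {}

    def register_model_connection(self, model_name, conn, fill_with_defaults=True):
        self.connections[model_name] = conn


def register_with_context(backend, model_name: str, route_info: Optional[str]):
    # Mirrors _register_model_connection_with_context: act only when a
    # backend is configured and the request carried an override.
    if backend and route_info:
        backend.register_model_connection(
            model_name, {"hostname": route_info}, fill_with_defaults=True
        )


backend = StubBackend()
register_with_context(backend, "flan-t5-xl", "tgis-prod.example.svc:8033")
print(backend.connections)  # {'flan-t5-xl': {'hostname': 'tgis-prod.example.svc:8033'}}
```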
39 changes: 36 additions & 3 deletions caikit_nlp/toolkit/text_generation/tgis_utils.py
@@ -11,12 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file is for helper functions related to TGIS.
"""
"""This file is for helper functions related to TGIS."""

# Standard
from typing import Iterable
from typing import Iterable, Optional

# Third Party
import fastapi
import grpc

# First Party
@@ -33,6 +34,7 @@
TokenizationResults,
TokenStreamDetails,
)
from caikit.interfaces.runtime.data_model import RuntimeServerContextType
from caikit_tgis_backend.protobufs import generation_pb2
import alog

@@ -84,6 +86,9 @@
grpc.StatusCode.UNAUTHENTICATED: CaikitCoreStatusCode.UNAUTHORIZED,
}

# HTTP Header / gRPC Metadata key used to identify a route override
ROUTE_INFO_HEADER_KEY = "x-route-info"


def raise_caikit_core_exception(rpc_error: grpc.RpcError):
"""Helper to wrap logic of converting from grpc.RpcError ->
@@ -683,3 +688,31 @@ def unary_tokenize(
return TokenizationResults(
token_count=response.token_count,
)


def get_route_info(
context: Optional[RuntimeServerContextType],
) -> Optional[str]:
"""
Returns the value of the "x-route-info" header (HTTP) or metadata key (gRPC)
from the context if it was found, otherwise None (including when context
itself is None).
"""
if context is None:
return None

if isinstance(context, grpc.ServicerContext):
route_info = dict(context.invocation_metadata()).get(ROUTE_INFO_HEADER_KEY)
if route_info:
return route_info
elif isinstance(context, fastapi.Request):
route_info = context.headers.get(ROUTE_INFO_HEADER_KEY)
if route_info:
return route_info
else:
error.log_raise(
"<NLP92615097E>",
ValueError(f"context is of an unsupported type: {type(context)}"),
)
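
A quick usage sketch for the new `get_route_info` helper with a FastAPI request context; the `/probe` route is illustrative only. For gRPC, the same helper reads `context.invocation_metadata()`, and any other context type raises a ValueError through `error.log_raise`.

```python
# Illustrative only: expose a probe route that echoes the x-route-info
# header as seen by get_route_info (None when the header is absent).
import fastapi

from caikit_nlp.toolkit.text_generation.tgis_utils import get_route_info

app = fastapi.FastAPI()


@app.get("/probe")
def probe(request: fastapi.Request):
    return {"route_info": get_route_info(request)}
```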
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -15,7 +15,7 @@ classifiers=[
]
dependencies = [
"caikit[runtime-grpc,runtime-http]>=0.26.27,<0.27.0",
"caikit-tgis-backend>=0.1.27,<0.2.0",
"caikit-tgis-backend>=0.1.33,<0.2.0",
# TODO: loosen dependencies
"grpcio>=1.62.2", # explicitly pin grpc dependencies to a recent version to avoid pip backtracking
"grpcio-reflection>=1.62.2",