Leaves log for failed components in e2e tests.

jiyongjung · tfx-copybara · commit 1399624c2342 · 2020-08-18T22:08:44.000-07:00
PiperOrigin-RevId: 327372570
diff --git a/tfx/experimental/templates/taxi/e2e_tests/kubeflow_e2e_test.py b/tfx/experimental/templates/taxi/e2e_tests/kubeflow_e2e_test.py
@@ -63,7 +63,8 @@ def setUp(self):
     random_id = orchestration_test_utils.random_id()
     self._pipeline_name = 'taxi-template-kubeflow-e2e-test-' + random_id
     logging.info('Pipeline: %s', self._pipeline_name)
-    self._endpoint = self._get_endpoint()
+    self._namespace = 'kubeflow'
+    self._endpoint = self._get_endpoint(self._namespace)
     self._kfp_client = kfp.Client(host=self._endpoint)
     logging.info('ENDPOINT: %s', self._endpoint)
 
@@ -136,9 +137,10 @@ def _delete_target_container_image(self):
         'gcloud', 'container', 'images', 'delete', self._target_container_image
     ])
 
-  def _get_endpoint(self):
-    output = subprocess.check_output(
-        'kubectl describe configmap inverse-proxy-config -n kubeflow'.split())
+  def _get_endpoint(self, namespace):
+    cmd = 'kubectl describe configmap inverse-proxy-config -n {}'.format(
+        namespace)
+    output = subprocess.check_output(cmd.split())
     for line in output.decode('utf-8').split('\n'):
       if line.endswith('googleusercontent.com'):
         return line
@@ -207,7 +209,10 @@ def _run_pipeline(self):
         self._endpoint,
     ])
     self.assertEqual(0, result.exit_code)
-    self._wait_until_completed(self._parse_run_id(result.output))
+    run_id = self._parse_run_id(result.output)
+    self._wait_until_completed(run_id)
+    kubeflow_test_utils.print_failure_log_for_run(self._endpoint, run_id,
+                                                  self._namespace)
 
   def _parse_run_id(self, output: str):
     run_id_lines = [
diff --git a/tfx/orchestration/kubeflow/test_utils.py b/tfx/orchestration/kubeflow/test_utils.py
@@ -19,6 +19,7 @@
 from __future__ import print_function
 
 import datetime
+import json
 import os
 import re
 import shutil
@@ -59,8 +60,10 @@
 from tfx.types import component_spec
 from tfx.types import standard_artifacts
 from tfx.types.standard_artifacts import Model
+from tfx.utils import kube_utils
 
 
+# TODO(jiyongjung): Merge with kube_utils.PodStatus
 # Various execution status of a KFP pipeline.
 KFP_RUNNING_STATUS = 'running'
 KFP_SUCCESS_STATUS = 'succeeded'
@@ -113,7 +116,7 @@ def poll_kfp_with_retry(host: Text, run_id: Text, retry_limit: int,
     # https://github.com/kubeflow/pipelines/issues/3669
     # by wait-and-retry when ApiException is hit.
     try:
-      get_run_response = client._run_api.get_run(run_id=run_id)  # pylint: disable=protected-access
+      get_run_response = client.get_run(run_id=run_id)
     except rest.ApiException as api_err:
       # If get_run failed with ApiException, wait _POLLING_INTERVAL and retry.
       if retry_count < retry_limit:
@@ -144,6 +147,39 @@ def poll_kfp_with_retry(host: Text, run_id: Text, retry_limit: int,
     time.sleep(polling_interval)
 
 
+def print_failure_log_for_run(host: Text, run_id: Text, namespace: Text):
+  """Prints logs of failed components of a run.
+
+  Prints execution logs for failed components using `logging.info`.
+  This resembles the behavior of `argo logs` but uses the K8s API directly.
+  Doesn't print anything unless the run itself has failed.
+
+  Args:
+    host: address of the KFP deployment.
+    run_id: id of the execution of the pipeline.
+    namespace: namespace of K8s cluster.
+  """
+  client = kfp.Client(host=host)
+  run = client.get_run(run_id=run_id)
+  workflow_manifest = json.loads(run.pipeline_runtime.workflow_manifest)
+  if kube_utils.PodPhase(
+      workflow_manifest['status']['phase']) != kube_utils.PodPhase.FAILED:
+    return
+
+  k8s_client = kube_utils.make_core_v1_api()
+  pods = [i for i in workflow_manifest['status']['nodes'] if i['type'] == 'Pod']
+  for pod in pods:
+    if kube_utils.PodPhase(pod['phase']) != kube_utils.PodPhase.FAILED:
+      continue
+    display_name = pod['displayName']
+    pod_id = pod['id']
+
+    log = k8s_client.read_namespaced_pod_log(
+        pod_id, namespace=namespace, container='main')
+    for line in log.splitlines():
+      logging.info('%s:%s', display_name, line)
+
+
 # Custom component definitions for testing purpose.
 class _HelloWorldSpec(component_spec.ComponentSpec):
   INPUTS = {}