Fix "can't pickle" bug: move unsafe_execute to module level and pass its inputs via Process args

haesleinhuepf · haesleinhuepf · commit 628fd26d2fd7 · 2024-03-23T18:01:07.000+01:00
source: openai/human-eval#30
diff --git a/human_eval/execution.py b/human_eval/execution.py
@@ -9,68 +9,77 @@
 import signal
 import tempfile
 
+def unsafe_execute(problem, completion, result, timeout):
+
+    with create_tempdir():
+
+        # These system calls are needed when cleaning up tempdir.
+        import os
+        import shutil
+        rmtree = shutil.rmtree
+        rmdir = os.rmdir
+        chdir = os.chdir
+
+        # Disable functionalities that can make destructive changes to the test.
+        reliability_guard()
+
+        # Construct the check program and run it.
+        check_program = (
+            problem["prompt"] + completion + "\n" +
+            problem["test"] + "\n" +
+            f"check({problem['entry_point']})"
+        )
+
+        try:
+            exec_globals = {}
+            with swallow_io():
+                with time_limit(timeout):
+# WARNING
+# This program exists to execute untrusted model-generated code. Although
+# it is highly unlikely that model-generated code will do something overtly
+# malicious in response to this test suite, model-generated code may act
+# destructively due to a lack of model capability or alignment.
+# Users are strongly encouraged to sandbox this evaluation suite so that it
+# does not perform destructive actions on their host or network. For more
+# information on how OpenAI sandboxes its code, see the accompanying paper.
+# Once you have read this disclaimer and taken appropriate precautions,
+# uncomment the following line and proceed at your own risk:
+                    exec(check_program, exec_globals)
+            result.append("passed")
+        except TimeoutException:
+            result.append("timed out")
+        except BaseException as e:
+            result.append(f"failed: {e}")
+
+        # Needed for cleaning up.
+        shutil.rmtree = rmtree
+        os.rmdir = rmdir
+        os.chdir = chdir
+
 
 def check_correctness(problem: Dict, completion: str, timeout: float,
                       completion_id: Optional[int] = None) -> Dict:
     """
     Evaluates the functional correctness of a completion by running the test
-    suite provided in the problem. 
+    suite provided in the problem.
 
     :param completion_id: an optional completion ID so we can match
         the results later even if execution finishes asynchronously.
     """
 
-    def unsafe_execute():
-
-        with create_tempdir():
-
-            # These system calls are needed when cleaning up tempdir.
-            import os
-            import shutil
-            rmtree = shutil.rmtree
-            rmdir = os.rmdir
-            chdir = os.chdir
-
-            # Disable functionalities that can make destructive changes to the test.
-            reliability_guard()
-
-            # Construct the check program and run it.
-            check_program = (
-                problem["prompt"] + completion + "\n" +
-                problem["test"] + "\n" +
-                f"check({problem['entry_point']})"
-            )
-
-            try:
-                exec_globals = {}
-                with swallow_io():
-                    with time_limit(timeout):
-# WARNING
-# This program exists to execute untrusted model-generated code. Although
-# it is highly unlikely that model-generated code will do something overtly
-# malicious in response to this test suite, model-generated code may act
-# destructively due to a lack of model capability or alignment.
-# Users are strongly encouraged to sandbox this evaluation suite so that it 
-# does not perform destructive actions on their host or network. For more 
-# information on how OpenAI sandboxes its code, see the accompanying paper.
-# Once you have read this disclaimer and taken appropriate precautions, 
-# uncomment the following line and proceed at your own risk:
-#                         exec(check_program, exec_globals)
-                result.append("passed")
-            except TimeoutException:
-                result.append("timed out")
-            except BaseException as e:
-                result.append(f"failed: {e}")
-
-            # Needed for cleaning up.
-            shutil.rmtree = rmtree
-            os.rmdir = rmdir
-            os.chdir = chdir
-
     manager = multiprocessing.Manager()
     result = manager.list()
 
-    p = multiprocessing.Process(target=unsafe_execute)
+    # p = multiprocessing.Process(target=unsafe_execute)
+    p = multiprocessing.Process(
+        target=unsafe_execute,
+        args=(
+            problem,
+            completion,
+            result,
+            timeout
+        ),
+    )
     p.start()
     p.join(timeout=timeout + 1)
     if p.is_alive():
diff --git a/notebooks/samples_gpt-3.5-turbo-1106.jsonl_results.jsonl b/notebooks/samples_gpt-3.5-turbo-1106.jsonl_results.jsonl
@@ -0,0 +1,4 @@
+{"task_id": "./human-eval-bia/human_eval_bia_0.ipynb", "completion": "\n    labeled_image, num_labels = skimage.measure.label(binary_image, background=0, return_num=True)\n    return num_labels\n", "result": "failed: module 'signal' has no attribute 'setitimer'", "passed": false}
+{"task_id": "./human-eval-bia/human_eval_bia_1.ipynb", "completion": "\nfrom skimage import filters\nimport numpy as np\n\ndef apply_otsu_threshold_and_count_postiive_pixels(image):\n    threshold_value = filters.threshold_otsu(image)\n    binary_image = image > threshold_value\n    positive_pixels_count = np.sum(binary_image)\n    return binary_image, positive_pixels_count\n", "result": "failed: module 'signal' has no attribute 'setitimer'", "passed": false}
+{"task_id": "./human-eval-bia/human_eval_bia_0.ipynb", "completion": "\n    # Convert the image to uint8\n    binary_image = binary_image.astype(np.uint8)\n\n    # Perform connected component labeling\n    labeled_image, num_labels = skimage.measure.label(binary_image, return_num=True)\n\n    # Return the count of labeled objects\n    return num_labels\n", "result": "failed: module 'signal' has no attribute 'setitimer'", "passed": false}
+{"task_id": "./human-eval-bia/human_eval_bia_1.ipynb", "completion": "\nimport skimage.filters\n\ndef apply_otsu_threshold_and_count_postiive_pixels(image):\n    \"\"\"\n    Takes an image, applies Otus's threshold method to it to create a binary image and \n    counts the positive pixels.\n    \"\"\"\n    # Apply Otsu's thresholding to create a binary image\n    threshold = skimage.filters.threshold_otsu(image)\n    binary_image = image > threshold\n    \n    # Count positive pixels\n    positive_pixels_count = np.sum(binary_image)\n\n    return positive_pixels_count\n", "result": "failed: module 'signal' has no attribute 'setitimer'", "passed": false}