|
9 | 9 | import signal
|
10 | 10 | import tempfile
|
11 | 11 |
|
def unsafe_execute(problem, completion, result, timeout):
    """Run a completion against its problem's tests and record the outcome.

    The check program is the concatenation of ``problem["prompt"]``,
    ``completion``, ``problem["test"]`` and a final ``check(<entry_point>)``
    call. It is run with ``exec`` inside the ``create_tempdir()``,
    ``swallow_io()`` and ``time_limit(timeout)`` contexts.

    One status string is appended to ``result``: ``"passed"``,
    ``"timed out"`` or ``"failed: <exception text>"``.

    :param problem: mapping providing the ``"prompt"``, ``"test"`` and
        ``"entry_point"`` entries.
    :param completion: source text appended directly after the prompt.
    :param result: list-like object that receives the status string.
    :param timeout: limit handed to ``time_limit``.
    """
    with create_tempdir():

        # These system calls are needed when cleaning up tempdir, so keep
        # references to them before anything below can replace them.
        import os
        import shutil
        rmtree = shutil.rmtree
        rmdir = os.rmdir
        chdir = os.chdir

        try:
            # Disable functionalities that can make destructive changes to
            # the test.
            reliability_guard()

            # Construct the check program and run it.
            check_program = (
                problem["prompt"] + completion + "\n" +
                problem["test"] + "\n" +
                f"check({problem['entry_point']})"
            )

            try:
                exec_globals = {}
                with swallow_io():
                    with time_limit(timeout):
                        # WARNING
                        # This program exists to execute untrusted
                        # model-generated code. Although it is highly
                        # unlikely that model-generated code will do
                        # something overtly malicious in response to this
                        # test suite, model-generated code may act
                        # destructively due to a lack of model capability
                        # or alignment. Users are strongly encouraged to
                        # sandbox this evaluation suite so that it does not
                        # perform destructive actions on their host or
                        # network. The exec call below is ENABLED: only run
                        # this after taking appropriate precautions, and
                        # proceed at your own risk.
                        exec(check_program, exec_globals)
                result.append("passed")
            except TimeoutException:
                result.append("timed out")
            except BaseException as e:
                # BaseException is deliberate: the executed program may
                # raise anything, including SystemExit.
                result.append(f"failed: {e}")
        finally:
            # Needed for cleaning up. Restored in a ``finally`` so the
            # tempdir can still be removed even if recording the result
            # (or building the check program) raised.
            shutil.rmtree = rmtree
            os.rmdir = rmdir
            os.chdir = chdir
| 58 | + |
12 | 59 |
|
13 | 60 | def check_correctness(problem: Dict, completion: str, timeout: float,
|
14 | 61 | completion_id: Optional[int] = None) -> Dict:
|
15 | 62 | """
|
16 | 63 | Evaluates the functional correctness of a completion by running the test
|
17 |
| - suite provided in the problem. |
| 64 | + suite provided in the problem. |
18 | 65 |
|
19 | 66 | :param completion_id: an optional completion ID so we can match
|
20 | 67 | the results later even if execution finishes asynchronously.
|
21 | 68 | """
|
22 | 69 |
|
23 |
| - def unsafe_execute(): |
24 |
| - |
25 |
| - with create_tempdir(): |
26 |
| - |
27 |
| - # These system calls are needed when cleaning up tempdir. |
28 |
| - import os |
29 |
| - import shutil |
30 |
| - rmtree = shutil.rmtree |
31 |
| - rmdir = os.rmdir |
32 |
| - chdir = os.chdir |
33 |
| - |
34 |
| - # Disable functionalities that can make destructive changes to the test. |
35 |
| - reliability_guard() |
36 |
| - |
37 |
| - # Construct the check program and run it. |
38 |
| - check_program = ( |
39 |
| - problem["prompt"] + completion + "\n" + |
40 |
| - problem["test"] + "\n" + |
41 |
| - f"check({problem['entry_point']})" |
42 |
| - ) |
43 |
| - |
44 |
| - try: |
45 |
| - exec_globals = {} |
46 |
| - with swallow_io(): |
47 |
| - with time_limit(timeout): |
48 |
| -# WARNING |
49 |
| -# This program exists to execute untrusted model-generated code. Although |
50 |
| -# it is highly unlikely that model-generated code will do something overtly |
51 |
| -# malicious in response to this test suite, model-generated code may act |
52 |
| -# destructively due to a lack of model capability or alignment. |
53 |
| -# Users are strongly encouraged to sandbox this evaluation suite so that it |
54 |
| -# does not perform destructive actions on their host or network. For more |
55 |
| -# information on how OpenAI sandboxes its code, see the accompanying paper. |
56 |
| -# Once you have read this disclaimer and taken appropriate precautions, |
57 |
| -# uncomment the following line and proceed at your own risk: |
58 |
| -# exec(check_program, exec_globals) |
59 |
| - result.append("passed") |
60 |
| - except TimeoutException: |
61 |
| - result.append("timed out") |
62 |
| - except BaseException as e: |
63 |
| - result.append(f"failed: {e}") |
64 |
| - |
65 |
| - # Needed for cleaning up. |
66 |
| - shutil.rmtree = rmtree |
67 |
| - os.rmdir = rmdir |
68 |
| - os.chdir = chdir |
69 |
| - |
70 | 70 | manager = multiprocessing.Manager()
|
71 | 71 | result = manager.list()
|
72 | 72 |
|
73 |
| - p = multiprocessing.Process(target=unsafe_execute) |
| 73 | + # p = multiprocessing.Process(target=unsafe_execute) |
| 74 | + p = multiprocessing.Process( |
| 75 | + target=unsafe_execute, |
| 76 | + args=( |
| 77 | + problem, |
| 78 | + completion, |
| 79 | + result, |
| 80 | + timeout |
| 81 | + ), |
| 82 | + ) |
74 | 83 | p.start()
|
75 | 84 | p.join(timeout=timeout + 1)
|
76 | 85 | if p.is_alive():
|
|
0 commit comments