Merge pull request #7 from JetBrains-Research/HumanEval-Nagini

HumanEval Nagini
JetBrains-Research · Sep 22, 2024 · 862c252 · 862c252
2 parents f9c0a9f + e9ac856
commit 862c252
Show file tree

Hide file tree

Showing 18 changed files with 147 additions and 17 deletions.
diff --git a/.gitignore b/.gitignore
@@ -20,4 +20,6 @@ run_nagini.py
 /tmp
 results
 results/*
+/log_tries/
+/log_tries/*
 .direnv
diff --git a/.gitmodules b/.gitmodules
@@ -5,3 +5,7 @@
 [submodule "benches/HumanEval-Dafny"]
 	path = benches/HumanEval-Dafny
 	url = https://github.com/JetBrains-Research/HumanEval-Dafny
+
+[submodule "benches/HumanEval-Nagini"]
+	path = benches/HumanEval-Nagini
+	url = https://github.com/JetBrains-Research/HumanEval-Nagini
diff --git a/benches/HumanEval-Nagini b/benches/HumanEval-Nagini
diff --git a/prompts/humaneval-nagini/add.txt b/prompts/humaneval-nagini/add.txt
@@ -0,0 +1,8 @@
+Given the following Python program, and a set of Nagini invariants, output the program with invariants inserted into the correct place.
+Don't add any additional text comments, your response must contain only program with invariants.
+Do not provide ANY explanations. Don't include markdown backticks. Respond only in Python code, nothing else.
+The program:
+{program}
+–
+The invariants:
+{checks}
diff --git a/prompts/humaneval-nagini/ask_for_fixed.txt b/prompts/humaneval-nagini/ask_for_fixed.txt
@@ -0,0 +1,6 @@
+The following errors occurred during verification:
+{error}
+
+Please fix the error by adding, removing or modifying the invariants and return the fixed program.
+Don't add any additional text comments, your response must contain only program with invariants.
+Do not provide ANY explanations. Don't include markdown backticks. Respond only in Python code, nothing else.
diff --git a/prompts/humaneval-nagini/ask_for_fixed_had_errors.txt b/prompts/humaneval-nagini/ask_for_fixed_had_errors.txt
@@ -0,0 +1,6 @@
+There are still some errors:
+{error}
+
+Could you please fix them?
+Don't add any additional text comments, your response must contain only program with invariants.
+Do not provide ANY explanations. Don't include markdown backticks. Respond only in Python code, nothing else.
diff --git a/prompts/humaneval-nagini/produce.txt b/prompts/humaneval-nagini/produce.txt
@@ -0,0 +1,7 @@
+Given the following Python program, output Nagini invariants that should go into the `while` loop.
+Ensure that the invariants are as comprehensive as they can be.
+Even if you think some invariant is not totally necessary, better add it than not.
+Don't add any additional text comments, your response must contain only program with invariants.
+Do not provide ANY explanations. Don't include markdown backticks. Respond only in Python code, nothing else.
+The program:
+{program}
diff --git a/prompts/humaneval-nagini/rewrite.txt b/prompts/humaneval-nagini/rewrite.txt
@@ -0,0 +1,42 @@
+Rewrite the following Python program, adding correct Nagini invariants into `while` loops.
+Do not change the code, only add the invariants.
+Ensure that the invariants are as comprehensive as they can be.
+Even if you think some invariant is not totally necessary, better add it than not.
+Don't add any additional text comments, your response must contain only program with invariants.
+Do not provide ANY explanations. Don't include markdown backticks. Respond only in Python code, nothing else.
+Also add assertions in necessary places.
+Do not change the code, only add invariants and assertions. Don't remove any helper functions, they are there to help you.
+You might need to work with accumulating functions, such as sum, so here's an example of how to do that:
+```
+from typing import cast, List, Dict, Set, Optional, Union
+from nagini_contracts.contracts import *
+
+@Pure
+def Sum(a : List[int], s : int, t : int) -> int :
+    Requires(Acc(list_pred(a)))
+    Requires(((0) <= (s)) and ((s) <= (t)) and ((t) <= (len(a))))
+
+    if s == t:
+        return 0
+    else:
+        return (a)[t - 1] + (Sum(a, s, t - 1))
+
+def sum_loop(numbers: List[int]) -> int:
+    Requires(Acc(list_pred(numbers)))
+    Ensures(Acc(list_pred(numbers)))
+    Ensures(Result() == Sum(numbers, 0, len(numbers)))
+    s = int(0)
+    i = int(0)
+    while (i) < (len(numbers)):
+        Invariant(Acc(list_pred(numbers)))
+        Invariant(0 <= i and i <= len(numbers))
+        Invariant(Forall(int, lambda d_1_p_:
+            (Implies(0 <= d_1_p_ and d_1_p_ < len(numbers), Sum(numbers, 0, d_1_p_ + 1) == Sum(numbers, 0, d_1_p_) + numbers[d_1_p_]), [[Sum(numbers, 0, d_1_p_ + 1)]])))
+        Invariant(s == Sum(numbers, 0, i))
+        Assert(Sum(numbers, 0, i + 1) == Sum(numbers, 0, i) + numbers[i])
+        s = s + (numbers)[i]
+        i = i + 1
+    return s
+```
+The program:
+{program}
diff --git a/prompts/humaneval-nagini/sys.txt b/prompts/humaneval-nagini/sys.txt
@@ -0,0 +1,4 @@
+You are an expert in a Python verification framework Nagini.
+You will be given tasks dealing with Python programs including precise docstrings and specifications.
+Do not provide ANY explanations. Don't include markdown backticks. Respond only in Python code, nothing else.
+Take into account that arrays inside the invariants are indexed by type `int`.
diff --git a/prompts/humaneval-nagini/timeout.txt b/prompts/humaneval-nagini/timeout.txt
@@ -0,0 +1,3 @@
+The verifier timed out during the verification.
+This usually means that the provided invariants were too broad or were difficult to check.
+Could you please try to improve the invariants and try again?
diff --git a/tests/test_nagini.py b/tests/test_nagini.py
@@ -11,8 +11,10 @@ def test_nagini_generate():
         def main(value: int) -> int:
             Requires(value >= 10)
             Ensures(Result() >= 20)
+            # impl-start
             Assert(value * 2 >= 20) # assert-line
-            return value * 2"""
+            return value * 2
+            # impl-end"""
     )
     assert nagini_lang.generate_validators(code) == dedent(
         """\
@@ -43,8 +45,12 @@ def main(value: int) -> int:
     assert nagini_lang.generate_validators(code) == dedent(
         """\
         def main_valid(value: int) -> int:
+            # pre-conditions-start
             Requires(value >= 10)
+            # pre-conditions-end
+            # post-conditions-start
             Ensures(Result() >= 20)
+            # post-conditions-end
             ret = main(value)
             return ret"""
     )
@@ -247,17 +253,23 @@ def alpha_valid(c : int) -> bool :
             ret = alpha(c)
             return ret
         def flip__char_valid(c : int) -> int :
+            # pre-conditions-start
             Ensures(lower(c) == upper(Result()))
             Ensures(upper(c) == lower(Result()))
+            # pre-conditions-end
             ret = flip__char(c)
             return ret
         def flip__case_valid(s : List[int]) -> List[int] :
+            # pre-conditions-start
             Requires(Acc(list_pred(s)))
+            # pre-conditions-end
+            # post-conditions-start
             Ensures(Acc(list_pred(s)))
             Ensures(Acc(list_pred(Result())))
             Ensures((len(Result())) == (len(s)))
             Ensures(Forall(int, lambda d_0_i_: (Implies(((0) <= (d_0_i_)) and ((d_0_i_) < (len(s))), lower((s)[d_0_i_]) == upper((Result())[d_0_i_])))))
             Ensures(Forall(int, lambda d_0_i_: (Implies(((0) <= (d_0_i_)) and ((d_0_i_) < (len(s))), upper((s)[d_0_i_]) == lower((Result())[d_0_i_])))))
+            # post-conditions-end
             ret = flip__case(s)
             return ret"""
     )
@@ -287,8 +299,10 @@ def flip__char(c : int) -> int :
     assert nagini_lang.generate_validators(code) == dedent(
         """\
         def flip__char_valid(c : int) -> int :
+            # pre-conditions-start
             Ensures(lower(c) == upper(Result()))
             Ensures(upper(c) == lower(Result()))
+            # pre-conditions-end
             ret = flip__char(c)
             return ret"""
     )
diff --git a/verified_cogen/args.py b/verified_cogen/args.py
@@ -23,6 +23,7 @@ class ProgramArgs:
     output_style: str
     filter_by_ext: Optional[str]
     log_tries: Optional[str]
+    output_logging: bool
 
     @no_type_check
     def __init__(self, args):
@@ -43,6 +44,7 @@ def __init__(self, args):
         self.output_style = args.output_style
         self.filter_by_ext = args.filter_by_ext
         self.log_tries = args.log_tries
+        self.output_logging = args.output_logging
 
 
 def get_default_parser():
@@ -95,6 +97,9 @@ def get_default_parser():
     parser.add_argument(
         "--log-tries", help="Save output of every try to given dir", required=False
     )
+    parser.add_argument(
+        "--output-logging", help="Print logs to standard output", default=False
+    )
     return parser
 
 

diff --git a/verified_cogen/experiments/incremental_run.py b/verified_cogen/experiments/incremental_run.py
@@ -1,5 +1,6 @@
 import logging
 import pathlib
+import sys
 import json
 
 from verified_cogen.llm.llm import LLM
@@ -15,6 +16,13 @@
 logger = logging.getLogger(__name__)
 
 
+def register_output_handler():
+    handler = logging.StreamHandler(sys.stdout)
+    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+
+
 def main():
     register_basic_languages()
 
@@ -26,6 +34,9 @@ def main():
     assert args.runs == 1
     assert args.retries == 0
 
+    if args.output_logging:
+        register_output_handler()
+
     directory = pathlib.Path(args.dir)
     log_tries = pathlib.Path(args.log_tries) if args.log_tries is not None else None
     results_directory = pathlib.Path("results")
@@ -52,8 +63,9 @@ def main():
             args.temperature,
         )
         runner = ValidatingRunner(
-            wrapping=InvariantRunner(llm, logger, verifier, log_tries),
+            wrapping=InvariantRunner(llm, logger, verifier),
             language=language,
+            log_tries=log_tries,
         )
         display_name = rename_file(file)
         marker_name = str(file.relative_to(directory))

diff --git a/verified_cogen/runners/languages/dafny.py b/verified_cogen/runners/languages/dafny.py
@@ -23,4 +23,5 @@ def __init__(self):  # type: ignore
                 r" *// invariants-start.*?// invariants-end\n",
             ],
             "// assert-line",
+            "//",
         )
diff --git a/verified_cogen/runners/languages/language.py b/verified_cogen/runners/languages/language.py
@@ -1,10 +1,10 @@
 from abc import abstractmethod
 from typing import Pattern, Any
-import re
 
 
 class Language:
     _instance = None
+    simple_comment: str
 
     def __new__(cls, *args: list[Any], **kwargs: dict[str, Any]):
         if not isinstance(cls._instance, cls):
@@ -33,14 +33,15 @@ def __init__(  # type: ignore
         validator_template: str,
         assert_invariants_pattern: list[str],
         inline_assert_comment: str,
+        simple_comment: str,
     ):
+        self.simple_comment = simple_comment
         self.method_regex = method_regex
         self.validator_template = validator_template
         self.assert_invariant_patterns = assert_invariants_pattern
         self.inline_assert_comment = inline_assert_comment
 
     def generate_validators(self, code: str) -> str:
-        code = re.sub(r"^ *#.*(\r\n|\r|\n)?", "", code, flags=re.MULTILINE)
         methods = self.method_regex.finditer(code)
 
         validators: list[str] = []

diff --git a/verified_cogen/runners/languages/nagini.py b/verified_cogen/runners/languages/nagini.py
@@ -16,7 +16,7 @@ class NaginiLanguage(GenericLanguage):
     def __init__(self):  # type: ignore
         super().__init__(
             re.compile(
-                r"def\s+(\w+)\s*\((.*?)\)\s*->\s*(.*?):(:?(?:\r\n|\r|\n)?( *(?:Requires|Ensures)\([^\r\n]*\)(?:\r\n|\r|\n)?)*)",
+                r"def\s+(\w+)\s*\((.*?)\)\s*->\s*(.*?):(.*?(\r\n|\r|\n))\s+# impl-start",
                 re.DOTALL,
             ),
             NAGINI_VALIDATOR_TEMPLATE,
@@ -25,4 +25,5 @@ def __init__(self):  # type: ignore
                 r" *# invariants-start.*?# invariants-end\n?",
             ],
             "# assert-line",
+            "#",
         )
diff --git a/verified_cogen/runners/validating.py b/verified_cogen/runners/validating.py
@@ -1,3 +1,4 @@
+import pathlib
 from typing import Optional
 from verified_cogen.runners import Runner
 from verified_cogen.runners.languages.language import Language
@@ -9,14 +10,20 @@ class ValidatingRunner(Runner):
     language: Language
     prg: Optional[str] = None
 
-    def __init__(self, wrapping: Runner, language: Language):
-        super().__init__(wrapping.llm, wrapping.logger, wrapping.verifier)
+    def __init__(
+        self,
+        wrapping: Runner,
+        language: Language,
+        log_tries: Optional[pathlib.Path] = None,
+    ):
+        super().__init__(wrapping.llm, wrapping.logger, wrapping.verifier, log_tries)
         self.wrapped_runner = wrapping
         self.language = language
 
     def _add_validators(self, prg: str, inv_prg: str):
         validators = self.language.generate_validators(prg)
-        val_prg = inv_prg + "\n// ==== verifiers ==== //\n" + validators
+        comment = self.language.simple_comment
+        val_prg = inv_prg + "\n" + comment + " ==== verifiers ==== \n" + validators
         return val_prg
 
     def preprocess(self, prg: str, mode: Mode) -> str:

diff --git a/verified_cogen/tools/verifier.py b/verified_cogen/tools/verifier.py
@@ -1,6 +1,5 @@
 import logging
 import os
-import signal
 import subprocess
 from pathlib import Path
 from typing import Optional
@@ -15,13 +14,20 @@ def __init__(self, shell: str, verifier_cmd: str, timeout: int = 60):
         self.timeout = timeout
 
     def verify(self, file_path: Path) -> Optional[tuple[bool, str, str]]:
-        proc = subprocess.Popen(
-            [self.shell, "-i", "-l", "-c", f'{self.verifier_cmd} "{file_path}"'],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-        )
         try:
-            out, err = proc.communicate(timeout=self.timeout)
-            return proc.returncode == 0, out.decode(), err.decode()
+            res = subprocess.run(
+                '{} -i -l -c "{} "{}""; exit'.format(
+                    self.shell, self.verifier_cmd, file_path
+                ),
+                capture_output=True,
+                shell=True,
+                timeout=self.timeout,
+            )
         except subprocess.TimeoutExpired:
-            os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
+            os.system("killall z3")
+            return None
+        return (
+            res.returncode == 0,
+            res.stdout.decode("utf-8"),
+            res.stderr.decode("utf-8"),
+        )
-Original file line number
+Diff line change
@@ Expand Up / @@ -23,4 +23,5 @@ def __init__(self): # type: ignore @@
                     r" *// invariants-start.*?// invariants-end\n",
                 ],
                 "// assert-line",
+                "//",
             )