Challenges in reproducing the HumanEval scores reported in the paper with BigCode's Eval Harness

#2
by Nadav-Timor - opened

Would appreciate your guidance in reproducing the HumanEval scores reported in the paper. Using BigCode's Eval Harness, we got

"humaneval": {
    "pass@1": 5.000000000000004e-05,
    "pass@10": 0.0005000000000000004,
    "pass@100": 0.005
  } 

(params used: --limit=100 --max_length_generation=256 --temperature=0.2 --n_samples=200 --batch_size=200)

Here is an input from HumanEval followed by the code generated by the model. (The generated code starts immediately after the end of the docstring.)

from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """r agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.client.impl.protocol.task.map;

import com.hazelcast.client.impl.protocol.ClientMessage;
import com.hazelcast.client.impl.protocol.codec.MapMessageCodec;
import com.hazelcast.client.impl.protocol.task.AbstractMessageTask;
import com.hazelcast.client
Click here for additional examples:
from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and return the list of those.
    Separate groups are balanced (each open brace is properly closed) and not nested within each other
    Ignore any spaces in the input string.
    >>> separate_paren_groups('( ) (( )) (( )( ))')
    ['()', '(())', '(()())']
    """express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.internal.serialization;

import com.hazelcast.internal.serialization.impl.AbstractSerializationService;
import com.hazelcast.internal.serialization.impl.SerializationService;
import com.hazelcast.internal.serialization.impl.SerializationServiceImpl;
import com.hazelcast.internal.serialization.impl.
def truncate_number(number: float) -> float:
    """ Given a positive floating point number, it can be decomposed into
    and integer part (largest integer smaller than given number) and decimals
    (leftover part always smaller than 1).

    Return the decimal part of the number.
    >>> truncate_number(3.5)
    0.5
    """iew;
import android.widget.Button;
import android.widget.EditText;
import android.widget.TextView;
import android.widget.Toast;

import com.example.android.sunshine.R;
import com.example.android.sunshine.activity.MainActivity;
import com.example.android.sunshine.activity.MainActivity.MainActivityListener;
import com.example.android.sunshine.activity.MainActivity.MainActivityListener.MainActivityListener;
import com.example.android.sunshine.activity.MainActivity.MainActivityListener.MainActivityListener.MainActivityListener.MainActivityListener.MainActivityListener.MainActivityListener.MainActivityListener.MainActivityListener.MainActivityListener.
from typing import List


def below_zero(operations: List[int]) -> bool:
    """ You're given a list of deposit and withdrawal operations on a bank account that starts with
    zero balance. Your task is to detect if at any point the balance of account fallls below zero, and
    at that point function should return True. Otherwise it should return False.
    >>> below_zero([1, 2, 3])
    False
    >>> below_zero([1, 2, -4, 5])
    True
    """ndroid.sunshine.model.User.UserType;
import com.example.android.sunshine.model.User.UserType.UserTypeType;

public class MainActivity extends AppCompatActivity {

    

@Override
	
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);

        Button button = (Button) findViewById(R.id.button);
        button.setOnClickListener(new View.OnClickListener() {
           
from typing import List


def mean_absolute_deviation(numbers: List[float]) -> float:
    """ For a given list of input numbers, calculate Mean Absolute Deviation
    around the mean of this dataset.
    Mean Absolute Deviation is the average absolute difference between each
    element and a centerpoint (mean in this case):
    MAD = average | x - x_mean |
    >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])
    1.0
    """AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.example.android.util;

import android.content.Context;
import android.graphics.Bitmap;
import android.graphics.BitmapFactory;
import android.graphics.Canvas;
import android.graphics.Color;
import android.graphics.Paint;
import android.graphics.Rect;
import android.graphics.drawable.Drawable;
import android.graphics.drawable.
from typing import List


def intersperse(numbers: List[int], delimeter: int) -> List[int]:
    """ Insert a number 'delimeter' between every two consecutive elements of input list `numbers'
    >>> intersperse([], 4)
    []
    >>> intersperse([1, 2, 3], 4)
    [1, 4, 2, 4, 3]
    """icenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.internal.serialization;

import com.hazelcast.internal.serialization.impl.AbstractSerializationService;
import com.hazelcast.internal.serialization.impl.DefaultSerializationService;
import com.hazelcast.internal.serialization.impl.DefaultSerializationServiceBuilder;
import com.hazelcast.internal.serialization.
from typing import List


def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group, output the deepest level of nesting of parentheses.
    E.g. (()()) has maximum two levels of nesting while ((())) has three.

    >>> parse_nested_parens('(()()) ((())) () ((())()())')
    [2, 3, 1, 3]
    """AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.internal.serialization;

import com.hazelcast.internal.serialization.impl.DefaultSerializationService;
import com.hazelcast.internal.serialization.impl.DefaultSerializationServiceFactory;
import com.hazelcast.internal.serialization.impl.DefaultSerializationServiceFactoryBuilder;
import com.hazelcast.internal.serial
from typing import List


def filter_by_substring(strings: List[str], substring: str) -> List[str]:
    """ Filter an input list of strings only for ones that contain given substring
    >>> filter_by_substring([], 'a')
    []
    >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')
    ['abc', 'bacd', 'array']
    """y applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.internal.partition;

import com.hazelcast.internal.partition.impl.PartitionServiceImpl;
import com.hazelcast.internal.partition.impl.PartitionServicePartitionImpl;
import com.hazelcast.internal.partition.impl.PartitionServicePartitionService;
import com.hazelcast.internal.partition.impl.PartitionService

Note that the results above are the first we got (no cherry-picking).

Thanks!

Nadav-Timor changed discussion title from Issues to reproduce the HumanEval scores reported in the paper to Challenges in Reproducing the HumanEval Scores Reported in the Paper
Nadav-Timor changed discussion title from Challenges in Reproducing the HumanEval Scores Reported in the Paper to Challenges in reproducing the HumanEval scores reported in the paper

Hi there, please refer to our released scripts (see here) to reproduce the HumanEval results.

Nadav-Timor changed discussion title from Challenges in reproducing the HumanEval scores reported in the paper to Challenges in reproducing the HumanEval scores reported in the paper with BigCode Eval Harness
Nadav-Timor changed discussion title from Challenges in reproducing the HumanEval scores reported in the paper with BigCode Eval Harness to Challenges in reproducing the HumanEval scores reported in the paper with BigCode's Eval Harness

Hi @yuewang-sf , thanks for the link.
Did you save the model's raw completions you compared with OpenAI's ground truth?

Would appreciate your guidance in reproducing the HumanEval scores reported in the paper. Using BigCode's Eval Harness, we got

"humaneval": {
    "pass@1": 5.000000000000004e-05,
    "pass@10": 0.0005000000000000004,
    "pass@100": 0.005
  } 

(params used: --limit=100 --max_length_generation=256 --temperature=0.2 --n_samples=200 --batch_size=200)

Using the same codebase and execution params, Salesforce/codegen-350M-mono achieved:

{
  "humaneval": {
    "pass@1": 0.21515,
    "pass@10": 0.3201372339273634,
    "pass@100": 0.3812360762120558
  }
}

Hi Nadav, please refer to our response to this issue in our CodeT5 repo. We do save the model's raw completions and can release them soon.

Thanks @yuewang-sf !

Do you happen to know if Hugging Face's Inference API is compatible with your method? In my attempt below, I got the same result that I mentioned in my first message.

import requests

# Hugging Face Inference API endpoint for the Salesforce/codet5p-220m model.
API_URL = "https://api-inference.huggingface.co/models/Salesforce/codet5p-220m"
# Bearer-token authorization header; the token shown is a redacted placeholder.
headers = {"Authorization": "Bearer xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}

def query(payload):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

    The module-level ``headers`` dict is sent with the request.
    """
    return requests.post(API_URL, headers=headers, json=payload).json()

# HumanEval prompt for has_close_elements: signature plus docstring, ending
# right after the docstring's closing quotes so the model continues from there.
humaneval_input_1 = """from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    \"\"\""""
    
# Send the prompt using the same temperature (0.2) and length limit (256)
# values as the evaluation-harness run reported earlier in the thread.
output = query({
    "inputs": humaneval_input_1,
    "parameters": {
        "temperature": 0.2,
        "max_length": 256,
    },
    "options": {
        # NOTE(review): presumably asks the API to wait until the model is
        # loaded rather than returning an error — confirm against the
        # Inference API documentation.
        "wait_for_model": True
    }
})

Then, print(output[0]["generated_text"]) prints

/*
 * Copyright (c) 2008-2021, Hazelcast, Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.internal.serialization;

import com.hazelcast.internal.serialization.impl.AbstractSerializationService;
import com.hazelcast.internal.serialization.impl.DefaultSerializationService;
import com.hazelcast.internal.serialization.impl.DefaultSerializationServiceFactory;
import com.hazelcast.internal.serialization.

Hi Nadav, please refer to our response to this issue at our CodeT5 repo. We do save the model's raw completions and can release it soon.

Our issue is with Salesforce/codet5p-220m and Salesforce/codet5p-220m-py. We'd appreciate it if you could share raw generations. Thank you!

Sign up or log in to comment