Skip to content

GSM8K

get_input_data_model()

Returns the GSM8K input data model for use in pipeline configurations.

Returns:

Type Description
DataModel

The GSM8K input data_model

Source code in synalinks/src/datasets/gsm8k.py
@synalinks_export("synalinks.datasets.gsm8k.get_input_data_model")
def get_input_data_model():
    """
    Give access to the data model describing a GSM8K input.

    The class itself is returned (not an instance), so it can be handed
    directly to a pipeline configuration.

    Returns:
        (DataModel): The `MathQuestion` data model class
    """
    return MathQuestion

get_output_data_model()

Returns the GSM8K output data model for use in pipeline configurations.

Returns:

Type Description
DataModel

The GSM8K output data_model

Source code in synalinks/src/datasets/gsm8k.py
@synalinks_export("synalinks.datasets.gsm8k.get_output_data_model")
def get_output_data_model():
    """
    Give access to the data model describing a GSM8K output.

    The class itself is returned (not an instance), so it can be handed
    directly to a pipeline configuration.

    Returns:
        (DataModel): The `NumericalAnswerWithThinking` data model class
    """
    return NumericalAnswerWithThinking

load_data()

Load the GSM8K dataset from Hugging Face and format it into input/output data models.

Example:

(x_train, y_train), (x_test, y_test) = synalinks.datasets.gsm8k.load_data()

Returns:

Type Description
tuple

The train and test data ready for training

Source code in synalinks/src/datasets/gsm8k.py
def _format_split(split):
    """
    Convert one GSM8K split into arrays of input and output data models.

    Each record's `answer` field is split on the `####` marker: the text
    before the first marker is kept as the reasoning, and the text after the
    last marker is parsed as the numerical answer.

    Args:
        split (Iterable[dict]): Records exposing `question` and `answer` keys.

    Returns:
        (tuple): The inputs and the targets, each as an object array
    """
    inputs = []
    targets = []
    for data_point in split:
        question = data_point["question"]
        # Split once and reuse both ends instead of splitting per field.
        parts = data_point["answer"].split("####")
        thinking = parts[0].strip()
        answer = parts[-1].strip()
        inputs.append(MathQuestion(question=question))
        targets.append(
            NumericalAnswerWithThinking(
                thinking=thinking,
                # float() rejects thousands separators such as "1,000".
                answer=float(answer.replace(",", "")),
            )
        )
    return np.array(inputs, dtype="object"), np.array(targets, dtype="object")


@synalinks_export("synalinks.datasets.gsm8k.load_data")
def load_data():
    """
    Load and format the GSM8K data from HuggingFace

    Both the `train` and `test` splits of the `main` configuration are
    converted with the same formatting routine, so they always share the
    same structure.

    Example:

    ```python
    (x_train, y_train), (x_test, y_test) = synalinks.datasets.gsm8k.load_data()
    ```

    Returns:
        (tuple): The train and test data ready for training

    Raises:
        ValueError: If the text after the `####` marker of a record cannot
            be parsed as a float.
    """
    dataset = load_dataset("gsm8k", "main")

    x_train, y_train = _format_split(dataset["train"])
    x_test, y_test = _format_split(dataset["test"])

    return (x_train, y_train), (x_test, y_test)