Skip to content

GSM8K

GSM8KInput

Bases: DataModel

Input data model.

Source code in synalinks/src/datasets/gsm8k.py
class GSM8KInput(DataModel):
    """Input data model."""

    # NOTE(review): the class docstring and the Field description may be
    # surfaced in the generated schema -- confirm before rewording them.

    # Single field: the text of the math word problem to solve.
    question: str = Field(description="The math word problem")

GSM8KOutput

Bases: DataModel

Output data model.

Source code in synalinks/src/datasets/gsm8k.py
class GSM8KOutput(DataModel):
    """Output data model."""

    # NOTE(review): the class docstring and the Field descriptions may be
    # surfaced in the generated schema -- confirm before rewording them.

    # Free-text reasoning; declared first so it precedes the final answer.
    thinking: str = Field(description="Your step by step thinking")
    # Final numeric result, stored as a float.
    answer: float = Field(description="The numerical answer")

get_input_data_model()

Returns the GSM8K input data model for pipeline configurations.

Returns:

| Type | Description |
| --- | --- |
| DataModel | The GSM8K input data model |

Source code in synalinks/src/datasets/gsm8k.py
@synalinks_export("synalinks.datasets.gsm8k.get_input_data_model")
def get_input_data_model():
    """
    Provide the data model class used for GSM8K inputs.

    The class itself is returned, not an instance of it.

    Returns:
        (DataModel): The `GSM8KInput` data model class
    """
    return GSM8KInput

get_output_data_model()

Returns the GSM8K output data model for pipeline configurations.

Returns:

| Type | Description |
| --- | --- |
| DataModel | The GSM8K output data model |

Source code in synalinks/src/datasets/gsm8k.py
@synalinks_export("synalinks.datasets.gsm8k.get_output_data_model")
def get_output_data_model():
    """
    Provide the data model class used for GSM8K outputs.

    The class itself is returned, not an instance of it.

    Returns:
        (DataModel): The `GSM8KOutput` data model class
    """
    return GSM8KOutput

load_data()

Load the GSM8K dataset from Hugging Face and format it into train and test data.

Example:

(x_train, y_train), (x_test, y_test) = synalinks.datasets.gsm8k.load_data()

Returns:

| Type | Description |
| --- | --- |
| tuple | The train and test data ready for training |

Source code in synalinks/src/datasets/gsm8k.py
def _format_gsm8k_split(split):
    """
    Convert one GSM8K split into parallel input and output arrays.

    Each datapoint's `answer` text is cut on the `####` marker: the text
    before the first marker is kept as the thinking, and the text after the
    last marker is parsed as the numerical answer.

    Args:
        split (iterable): Datapoints indexable by the `question` and
            `answer` keys.

    Returns:
        (tuple): Two object arrays `(inputs, outputs)` holding `GSM8KInput`
            and `GSM8KOutput` instances, in the order of the split.

    Raises:
        ValueError: If the text after the last `####` marker (or the whole
            answer when no marker is present) is not a valid float.
    """
    inputs = []
    outputs = []

    for datapoint in split:
        question = datapoint["question"]
        # Split once and reuse both ends instead of splitting twice.
        parts = datapoint["answer"].split("####")
        thinking = parts[0].strip()
        answer = parts[-1].strip()
        inputs.append(GSM8KInput(question=question))
        outputs.append(
            GSM8KOutput(
                thinking=thinking,
                # Drop thousands separators such as "1,000" before parsing.
                answer=float(answer.replace(",", "")),
            )
        )

    return (
        np.array(inputs, dtype="object"),
        np.array(outputs, dtype="object"),
    )


@synalinks_export("synalinks.datasets.gsm8k.load_data")
def load_data():
    """
    Load and format data from HuggingFace

    Example:

    ```python
    (x_train, y_train), (x_test, y_test) = synalinks.datasets.gsm8k.load_data()
    ```

    Returns:
        (tuple): The train and test data ready for training, as
            `(x_train, y_train), (x_test, y_test)` where every element is
            an object array of data models.
    """
    dataset = load_dataset("gsm8k", "main")

    # Both splits share the same schema, so one helper formats each of them.
    x_train, y_train = _format_gsm8k_split(dataset["train"])
    x_test, y_test = _format_gsm8k_split(dataset["test"])

    return (x_train, y_train), (x_test, y_test)