PathRegexSearch module

`PathRegexSearch`

Bases: Module

Regex variable-length path search where BOTH endpoints match.

LM-driven wrapper around `KnowledgeBase.path_regex_search`. Returns paths of between `min_hops` and `max_hops` edges (both bounds inclusive) whose subject endpoint matches `subj_regex_search` AND whose object endpoint matches `obj_regex_search`. Regex matching uses RE2, so patterns run in linear time and are not vulnerable to catastrophic backtracking.

Parameters:

Name	Type	Description	Default
`knowledge_base`	`KnowledgeBase`	The knowledge base to search. Required.	`None`
`subj_schema`	`dict`	JSON schema of the subject entity. Used to infer `subj_label` from its `title` when not given explicitly. Mutually inferrable with `subj_entity_model`.	`None`
`subj_entity_model`	`Entity | SymbolicDataModel`	Subject entity model (provides `subj_schema` / `subj_label`).	`None`
`subj_label`	`str`	Subject entity label. Optional — when neither it nor a schema to derive it from is given, the language model infers it per call (constrained to the knowledge base's entity labels).	`None`
`obj_schema`	`dict`	JSON schema of the object entity.	`None`
`obj_entity_model`	`Entity | SymbolicDataModel`	Object entity model (provides `obj_schema` / `obj_label`).	`None`
`obj_label`	`str`	Object entity label. Optional — inferred per call like `subj_label` when not given.	`None`
`rel_label`	`str`	Optional rel-label constraint applied to every hop.	`None`
`min_hops`	`int`	Minimum hop count, inclusive. Defaults to 1.	`1`
`max_hops`	`int`	Maximum hop count, inclusive. Defaults to 3.	`3`
`k`	`int`	Maximum number of results. Defaults to 10.	`10`
`fields`	`list`	Field names to match against. Applied to both endpoints.	`None`
`case_sensitive`	`bool`	When `False`, regex matches are case-insensitive. Defaults to `True`.	`True`
`output_format`	`str`	`"json"` (default) or `"csv"`.	`'json'`
`name`	`str`	Module name.	`None`
`description`	`str`	Module description.	`None`
`trainable`	`bool`	Whether the module's variables should be trainable.	`True`

Source code in synalinks/src/modules/retrievers/path_regex_search.py

@synalinks_export(
    [
        "synalinks.modules.PathRegexSearch",
        "synalinks.PathRegexSearch",
    ]
)
class PathRegexSearch(Module):
    """Regex variable-length path search where BOTH endpoints match.

    LM-driven wrapper around
    `KnowledgeBase.path_regex_search`. Returns paths of
    ``min_hops..max_hops`` edges whose subject endpoint matches
    ``subj_regex_search`` AND whose object endpoint matches
    ``obj_regex_search``. Regex uses RE2, so patterns are linear-time
    and not vulnerable to catastrophic backtracking.

    Args:
        knowledge_base (KnowledgeBase): The knowledge base to search.
            Required.
        language_model (LanguageModel): The language model used by the
            internal query generator to produce the two regex patterns
            (and any endpoint label that is not fixed).
        subj_schema (dict): JSON schema of the subject entity. Used
            to infer ``subj_label`` from its ``title`` when not given
            explicitly. Mutually inferrable with ``subj_entity_model``.
        subj_entity_model (Entity | SymbolicDataModel): Subject entity
            model (provides ``subj_schema`` / ``subj_label``).
        subj_label (str): Subject entity label. **Optional** — when
            neither it nor a schema to derive it from is given, the
            language model infers it per call (constrained to the
            knowledge base's entity labels).
        obj_schema (dict): JSON schema of the object entity.
        obj_entity_model (Entity | SymbolicDataModel): Object entity
            model (provides ``obj_schema`` / ``obj_label``).
        obj_label (str): Object entity label. **Optional** — inferred
            per call like ``subj_label`` when not given.
        rel_label (str): Optional rel-label constraint applied to
            every hop.
        min_hops (int): Minimum hop count, inclusive. Defaults to 1.
        max_hops (int): Maximum hop count, inclusive. Defaults to 3.
        k (int): Maximum number of results. Defaults to 10.
        fields (list): Field names to match against. Applied to both
            endpoints.
        case_sensitive (bool): When ``False``, regex matches are
            case-insensitive. Defaults to ``True``.
        output_format (str): ``"json"`` (default) or ``"csv"``.
        prompt_template (str): Forwarded unchanged to the internal
            query generator.
        examples (list): Forwarded unchanged to the internal query
            generator.
        instructions (str): Forwarded unchanged to the internal query
            generator.
        seed_instructions (str): Forwarded unchanged to the internal
            query generator.
        temperature (float): Forwarded unchanged to the internal query
            generator.
        max_tokens (int): Forwarded unchanged to the internal query
            generator.
        top_p (float): Forwarded unchanged to the internal query
            generator.
        top_k (int): Forwarded unchanged to the internal query
            generator.
        use_inputs_schema (bool): Forwarded unchanged to the internal
            query generator. Defaults to ``False``.
        use_outputs_schema (bool): Forwarded unchanged to the internal
            query generator. Defaults to ``False``.
        return_inputs (bool): When ``True`` (default), the module
            inputs are combined with the returned results.
        return_query (bool): When ``True`` (default), the generated
            query is combined with the returned results.
        name (str): Module name.
        description (str): Module description.
        trainable (bool): Whether the module's variables should be
            trainable.

    Raises:
        ValueError: If the hop range does not satisfy
            ``1 <= min_hops <= max_hops``, if ``output_format`` is not
            ``"json"`` or ``"csv"``, or if ``k`` is not a positive
            integer.
    """

    def __init__(
        self,
        *,
        knowledge_base=None,
        language_model=None,
        subj_schema=None,
        subj_entity_model=None,
        subj_label: Optional[str] = None,
        obj_schema=None,
        obj_entity_model=None,
        obj_label: Optional[str] = None,
        rel_label: Optional[str] = None,
        min_hops: int = 1,
        max_hops: int = 3,
        k: int = 10,
        fields: Optional[List[str]] = None,
        case_sensitive: bool = True,
        output_format: str = "json",
        prompt_template: Optional[str] = None,
        examples: Optional[list] = None,
        instructions: Optional[str] = None,
        seed_instructions: Optional[str] = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
        top_p: float | None = None,
        top_k: int | None = None,
        use_inputs_schema: bool = False,
        use_outputs_schema: bool = False,
        return_inputs: bool = True,
        return_query: bool = True,
        name: Optional[str] = None,
        description: Optional[str] = None,
        trainable: bool = True,
    ):
        super().__init__(
            name=name,
            description=description,
            trainable=trainable,
        )
        self.knowledge_base = _get_kb(knowledge_base)
        self.language_model = _get_lm(language_model)

        # Each endpoint resolves to a (schema, label) pair; the label may
        # still be None afterwards, in which case it is inferred per call.
        self.subj_schema, self.subj_label = resolve_endpoint(
            subj_schema, subj_entity_model, subj_label, "subj"
        )
        self.subj_entity_model = subj_entity_model
        self.obj_schema, self.obj_label = resolve_endpoint(
            obj_schema, obj_entity_model, obj_label, "obj"
        )
        self.obj_entity_model = obj_entity_model
        self.rel_label = rel_label

        if min_hops < 1 or max_hops < min_hops:
            raise ValueError(
                f"Invalid hop range: min_hops={min_hops}, "
                f"max_hops={max_hops}. Require 1 <= min_hops <= max_hops."
            )
        self.min_hops = min_hops
        self.max_hops = max_hops

        if output_format not in ("json", "csv"):
            raise ValueError(
                f"`output_format` must be 'json' or 'csv', got {output_format!r}"
            )
        self.output_format = output_format

        if not isinstance(k, int) or k < 1:
            raise ValueError(f"`k` must be a positive integer, got {k!r}")
        self.k = k
        self.fields = fields
        self.case_sensitive = case_sensitive

        self.prompt_template = prompt_template
        self.examples = examples
        self.instructions = instructions
        self.seed_instructions = seed_instructions
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.top_p = top_p
        self.top_k = top_k
        self.use_inputs_schema = use_inputs_schema
        self.use_outputs_schema = use_outputs_schema
        self.return_inputs = return_inputs
        self.return_query = return_query

        # Either endpoint label may be unset; when so the LM infers it per call,
        # constrained to the KB's entity labels (concatenated onto the query).
        infer_specs = []
        for field_name, role, fixed_label in (
            ("subj_label", "subject", self.subj_label),
            ("obj_label", "object", self.obj_label),
        ):
            if fixed_label is None:
                infer_specs.append(
                    (
                        field_name,
                        f"The {role} entity label for the path, chosen to best "
                        "answer the inputs.",
                        kb_entity_labels(self.knowledge_base),
                    )
                )

        if infer_specs:
            # At least one label is open: extend the base query schema with
            # the extra label field(s) the LM has to fill in.
            gen_target = {
                "schema": concat_infer_fields(
                    PathRegexSearchInput.get_schema(), infer_specs
                )
            }
        else:
            gen_target = {"data_model": PathRegexSearchInput}

        self.query_generator = Generator(
            **gen_target,
            language_model=self.language_model,
            prompt_template=self.prompt_template,
            examples=self.examples,
            instructions=self.instructions,
            seed_instructions=self.seed_instructions,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            top_p=self.top_p,
            top_k=self.top_k,
            use_inputs_schema=self.use_inputs_schema,
            use_outputs_schema=self.use_outputs_schema,
            # Inputs are re-attached in `call` (see `return_inputs`), so the
            # generator itself returns the bare query only.
            return_inputs=False,
            name="path_regex_search_query_generator_" + self.name,
        )

    async def call(self, inputs, training=False):
        """Generate a path query from `inputs` and run it on the knowledge base.

        Args:
            inputs: The module inputs handed to the query generator.
            training (bool): Forwarded to the query generator.

        Returns:
            The search results wrapped in a `JsonDataModel` using the
            `GenericResult` schema, combined with the generated query
            when `return_query` is set and with `inputs` when
            `return_inputs` is set. `None` when `inputs` is falsy, the
            generator produced nothing, or either pattern or either
            endpoint label is missing.
        """
        if not inputs:
            return None

        query = await self.query_generator(inputs, training=training)
        if not query:
            return None
        payload = query.get_json()
        subj_pattern = payload.get("subj_regex_search")
        obj_pattern = payload.get("obj_regex_search")
        # Fixed endpoint labels, or the ones the LM inferred this call.
        subj_label = self.subj_label or payload.get("subj_label")
        obj_label = self.obj_label or payload.get("obj_label")
        if not subj_pattern or not obj_pattern or not subj_label or not obj_label:
            return None

        rows = await self.knowledge_base.path_regex_search(
            subj_pattern=subj_pattern,
            obj_pattern=obj_pattern,
            subj_label=subj_label,
            obj_label=obj_label,
            label=self.rel_label,
            min_hops=self.min_hops,
            max_hops=self.max_hops,
            k=self.k,
            fields=self.fields,
            case_sensitive=self.case_sensitive,
            output_format=self.output_format,
        )
        results = JsonDataModel(
            json={"result": rows},
            schema=GenericResult.get_schema(),
            name=self.name,
        )
        if self.return_query:
            results = await ops.logical_and(
                query,
                results,
                name="results_with_query_" + self.name,
            )
        if self.return_inputs:
            results = await ops.logical_and(
                inputs,
                results,
                name="results_with_inputs_" + self.name,
            )
        return results

    async def compute_output_spec(self, inputs, training=False):
        """Build the symbolic output of this module without running a search.

        Mirrors the composition done in `call`: a `GenericResult`-shaped
        symbolic result, combined with the generator output when
        `return_query` is set and with `inputs` when `return_inputs` is
        set.
        """
        query = await self.query_generator(inputs, training=training)
        results = SymbolicDataModel(
            schema=GenericResult.get_schema(),
            name=self.name,
        )
        if self.return_query:
            results = await ops.logical_and(
                query,
                results,
                name="results_with_query_" + self.name,
            )
        if self.return_inputs:
            results = await ops.logical_and(
                inputs,
                results,
                name="results_with_inputs_" + self.name,
            )
        return results

    def get_config(self):
        """Return the constructor arguments of this module as a config dict.

        Plain values are stored directly; the knowledge base, the
        language model and the two endpoint entity models are stored in
        serialized form.
        """
        config = {
            "subj_schema": self.subj_schema,
            "subj_label": self.subj_label,
            "obj_schema": self.obj_schema,
            "obj_label": self.obj_label,
            "rel_label": self.rel_label,
            "min_hops": self.min_hops,
            "max_hops": self.max_hops,
            "k": self.k,
            # Copy so the returned config does not alias the live list.
            "fields": list(self.fields) if self.fields is not None else None,
            "case_sensitive": self.case_sensitive,
            "output_format": self.output_format,
            "prompt_template": self.prompt_template,
            "examples": self.examples,
            "instructions": self.instructions,
            "seed_instructions": self.seed_instructions,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
            "top_p": self.top_p,
            "top_k": self.top_k,
            "use_inputs_schema": self.use_inputs_schema,
            "use_outputs_schema": self.use_outputs_schema,
            "return_inputs": self.return_inputs,
            "return_query": self.return_query,
            "name": self.name,
            "description": self.description,
            "trainable": self.trainable,
        }
        knowledge_base_config = {
            "knowledge_base": serialization_lib.serialize_synalinks_object(
                self.knowledge_base,
            )
        }
        language_model_config = {
            "language_model": serialization_lib.serialize_synalinks_object(
                self.language_model,
            )
        }
        endpoint_models_config = {
            "subj_entity_model": serialize_entity_model(
                self.subj_entity_model, "subj_entity_model_" + self.name
            ),
            "obj_entity_model": serialize_entity_model(
                self.obj_entity_model, "obj_entity_model_" + self.name
            ),
        }
        return {
            **config,
            **knowledge_base_config,
            **language_model_config,
            **endpoint_models_config,
        }

    @classmethod
    def from_config(cls, config):
        """Rebuild a module from a config produced by `get_config`.

        The caller's `config` mapping is left untouched, so the same
        config can be reused to build several modules.

        Raises:
            KeyError: If `config` has no ``"knowledge_base"`` or
                ``"language_model"`` entry.
        """
        # Work on a shallow copy: the pops below would otherwise strip the
        # serialized objects out of the dict the caller passed in.
        config = dict(config)
        knowledge_base = serialization_lib.deserialize_synalinks_object(
            config.pop("knowledge_base")
        )
        language_model = serialization_lib.deserialize_synalinks_object(
            config.pop("language_model")
        )
        subj_entity_model = deserialize_entity_model(
            config.pop("subj_entity_model", None)
        )
        obj_entity_model = deserialize_entity_model(config.pop("obj_entity_model", None))
        return cls(
            knowledge_base=knowledge_base,
            language_model=language_model,
            subj_entity_model=subj_entity_model,
            obj_entity_model=obj_entity_model,
            **config,
        )

`PathRegexSearchInput`

Bases: DataModel

Input shape for PathRegexSearch.

Source code in synalinks/src/modules/retrievers/path_regex_search.py

class PathRegexSearchInput(DataModel):
    """Input shape for `PathRegexSearch`."""

    # Query structure the PathRegexSearch query generator is asked to
    # produce: one regex per path endpoint. PathRegexSearch reads both
    # values back by these exact field names.
    # NOTE(review): the docstring and the Field descriptions are presumably
    # emitted into the schema returned by `get_schema()` and therefore seen
    # by the language model — confirm before rewording any of them.

    # Pattern the subject (start) endpoint of a path must match.
    subj_regex_search: str = Field(
        description="Regex pattern (RE2) for the subject endpoint",
    )
    # Pattern the object (end) endpoint of a path must match.
    obj_regex_search: str = Field(
        description="Regex pattern (RE2) for the object endpoint",
    )