# cancer-pipeline/models/lung.py

# -*- coding: utf-8 -*-
"""
models/lung.py
This script sets up a series of data extraction models using the dspy library for pathology reports, specifically focusing on lung cancer. It includes model loading, signature definitions for various cancer types, and functions to convert model predictions into structured JSON formats.

author: Hong-Kai (Walther) Chen, Po-Yen Tzeng and Kai-Po Chang @ Med NLP Lab, China Medical University
date: 2025-10-13
"""

__version__ = "1.0.0"
__date__ = "2025-10-13"
__author__ = ["Hong-Kai (Walther) Chen", "Po-Yen Tzeng", "Kai-Po Chang"]
__copyright__ = "Copyright 2025, Med NLP Lab, China Medical University"
__license__ = "MIT"
__ajcc_version__ = 8
__cap_version__ = "4.3.0.1"

import dspy
from typing import Literal
from pydantic import BaseModel, Field


# Schema for one surgical-margin entry extracted from a lung pathology report.
# Documented with comments instead of a class docstring because pydantic copies
# a model docstring into the generated JSON schema, which would alter the schema
# text handed to the LM.
class LungMargin(BaseModel):
    # Which margin this entry refers to; non-standard margins map to "others".
    margin_category: (
        Literal["bronchial", "vascular", "parenchymal", "chest_wall", "others"] | None
    ) = Field(
        None,
        description="acceptable value for surgical margins in lung cancer. If not included in these standard margins, should be classified as others.",
    )
    # Required flag: True when the margin is involved by tumor.
    margin_involved: bool
    # Tumor-to-margin distance in mm (0 when involved, None when unknown).
    distance: int | None = Field(
        None,
        description="If margin is involved, return 0. If margin is uninvolved/free, try your best to find the distance at both microscopic and macroscopic(gross) description, and specify the distance from tumor to margin in mm, rounded to integer. If the margin is uninvolved/free and, after your best effort, the distance is still not specified, return null",
    )
    # Presumably a free-text note about the margin (TODO confirm with callers).
    # Defaults to None like every other nullable field in these models, so an
    # entry that omits the key still validates instead of being rejected.
    description: str | None = None


# Closed vocabularies used by LungLN, hoisted out of the class body so the
# field declarations stay short. Member order is preserved.
_LungLNSide = Literal["right", "left"]
_LungLNStation = Literal[
    "peribronchial",
    "1",
    "2",
    "4",
    "5",
    "6",
    "8",
    "9",
    "10",
    "11",
    "12",
    "13",
    "14",
    "3a",
    "3p",
    "7",
    "others",
]


# Schema for one regional lymph node station/group entry.
# Comments are used instead of a class docstring because pydantic copies a
# model docstring into the generated JSON schema.
class LungLN(BaseModel):
    # Laterality of the station; None when the report does not give one.
    lymph_node_side: _LungLNSide | None = Field(
        default=None,
        description=(
            "acceptable value for lymph node side in lung cancer. "
            "If not included in these standard sides, should be classified as None."
        ),
    )
    # Station/group label; anything outside the vocabulary maps to "others".
    lymph_node_category: _LungLNStation | None = Field(
        default=None,
        description=(
            "acceptable value for lymph node categories (i.e. stations or groups) in lung cancer. "
            "If not included in these standard lymph node 'stations'/'groups' should be classified as others."
        ),
    )
    # Both counts are required (no default).
    involved: int
    examined: int
    # Name of the station/group as given for this entry.
    station_name: str | None = Field(
        default=None,
        description="specify the name of the lymph node station/group here.",
    )


# Closed vocabulary of growth patterns accepted by LungHistologicalPattern.
_LungPatternName = Literal[
    "acinar",
    "lepidic",
    "papillary",
    "solid",
    "micropapillary",
    "others",
]


# Schema for one histological pattern and its share of the tumor.
# Comments are used instead of a class docstring because pydantic copies a
# model docstring into the generated JSON schema.
class LungHistologicalPattern(BaseModel):
    # Pattern label; both fields are nullable and default to None.
    pattern_name: _LungPatternName | None = Field(
        default=None,
        description="histological pattern of invasive non-mucinous adenocarcinoma",
    )
    pattern_percentage: int | None = Field(
        default=None,
        description="percentage of the histological pattern",
    )


# Closed vocabulary of biomarker categories accepted by LungBiomarker.
_LungBiomarkerCategory = Literal["ALK", "ROS1", "PDL1", "others"]


# Schema for one biomarker result.
# Comments are used instead of a class docstring because pydantic copies a
# model docstring into the generated JSON schema.
class LungBiomarker(BaseModel):
    # Category label; biomarkers outside the vocabulary map to "others".
    biomarker_category: _LungBiomarkerCategory | None = Field(
        default=None,
        description=(
            "acceptable value for biomarker categories in lung cancer. "
            "If not included in these standard categories, should be classified as others."
        ),
    )
    # Required flag (no default).
    expression: bool
    # Share of tumor cells positive for the biomarker, as an integer percent.
    percentage: int | None = Field(
        default=None,
        description=(
            "the percentage of tumor cells showing positive expression of the biomarker, "
            "rounded to integer. if not specified, return None"
        ),
    )
    # Name of the biomarker as given for this entry.
    biomarker_name: str | None = Field(
        default=None,
        description="specify the name of the biomarker here.",
    )


# Signature for the flat (non-nested) findings of a lung cancer excision report:
# procedure, laterality, site, focality, histology, grade and the boolean
# invasion/metastasis flags. Inputs are the report paragraphs plus a rough JSON
# summary; every output is nullable.
class LungCancerNonnested(dspy.Signature):
    """you need to extracting the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # NOTE: dspy uses the signature docstring above as the instructions sent to
    # the LM, so its wording is runtime text and is kept verbatim.

    # --- inputs ---
    report: list = dspy.InputField(
        desc="this is a pathological report for lung cancer excision, separated into paragraphs."
    )
    report_jsonized: dict = dspy.InputField(
        desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
    )

    # --- outputs ---
    # The example in `desc` must be a member of the Literal below; it previously
    # named "polypectomy", which is not a lung procedure and not an accepted value.
    procedure: (
        Literal[
            "wedge_resection",
            "segmentectomy",
            "lobectomy",
            "completion_lobectomy",
            "sleeve_lobectomy",
            "bilobectomy",
            "pneumonectomy",
            "major_airway_resection",
            "others",
        ]
        | None
    ) = dspy.OutputField(
        desc="identify which surgery procedure was used. e.g. lobectomy"
    )
    surgical_technique: (
        Literal["open", "thoracoscopic", "robotic", "hybrid", "others"] | None
    ) = dspy.OutputField(desc="identify how the surgery was taken. e.g. thoracoscopic")
    # NOTE(review): field name is spelled "sideness"; left unchanged because
    # the name is part of the output schema callers read.
    sideness: Literal["right", "left", "midline"] | None = dspy.OutputField(
        desc="identify which side the tumor is located. e.g. right"
    )
    cancer_primary_site: (
        Literal[
            "upper_lobe",
            "middle_lobe",
            "lower_lobe",
            "main_bronchus",
            "bronchus_intermedius",
            "bronchus_lobar",
            "others",
        ]
        | None
    ) = dspy.OutputField(desc="identify the primary site of cancer. e.g. upper_lobe")
    # The focality values carry the T/M category they imply as a suffix.
    tumor_focality: (
        Literal[
            "single_focus",
            "separate_in_same_lobe_t3",
            "separate_nodule_in_ipsilateral_t4",
            "separate_nodule_in_contralateral_m1a",
        ]
        | None
    ) = dspy.OutputField(
        desc="identify whether the tumor is single focus or multiple foci; this is important for t and m category"
    )
    # NOTE(review): the catch-all here is "other", while every other vocabulary
    # in this file uses "others". Left unchanged because the literal is part of
    # the output contract; confirm with downstream consumers before unifying.
    histology: (
        Literal[
            "adenocarcinoma",
            "squamous_cell_carcinoma",
            "adenosquamous_carcinoma",
            "large_cell_carcinoma",
            "large_cell_neuroendocrine_carcinoma",
            "small_cell_carcinoma",
            "carcinoid_tumor",
            "sarcomatoid_carcinoma",
            "pleomorphic_carcinoma",
            "pulmonary_lymphoepithelioma_like_carcinoma",
            "mucoepidermoid_carcinoma",
            "salivary_gland_type_tumor",
            "non_small_cell_carcinoma_not_specified",
            "non_small_cell_carcinoma_with_neuroendocrine_features",
            "other",
        ]
        | None
    ) = dspy.OutputField(
        desc="identify the histological type of the cancer. e.g. adenocarcinoma"
    )
    # Differentiation mapped to an integer 1-4 as spelled out in `desc`.
    grade: int | None = dspy.OutputField(
        desc="identify the grade of the cancer, well->1, moderate->2, poor->3, undiff->4"
    )
    # Tri-state flags: True / False / None (not stated in the report).
    lymphovascular_invasion: bool | None = dspy.OutputField(
        desc="check whether or not lymphovascular invasion is present"
    )
    perineural_invasion: bool | None = dspy.OutputField(
        desc="check whether or not perineural invasion is present"
    )
    spread_through_air_spaces_stas: bool | None = dspy.OutputField(
        desc="check whether or not spread through air spaces (STAS) is present"
    )
    visceral_pleural_invasion: bool | None = dspy.OutputField(
        desc="check whether or not visceral pleural invasion is present"
    )
    direct_invasion_of_adjacent_structures: bool | None = dspy.OutputField(
        desc="check whether or not direct invasion of adjacent structures is present"
    )
    distant_metastasis: bool | None = dspy.OutputField(
        desc="check whether or not distant metastasis is present"
    )
    # Free text; None when the report states there was no presurgical therapy.
    treatment_effect: str | None = dspy.OutputField(
        desc='check the treatment effect of the cancer. If you see "No known presurgical therapy", return None'
    )


# Closed vocabularies for the staging outputs, hoisted out of the class body.
# Member order is preserved.
_LungPTCategory = Literal[
    "tx", "tis", "t1mi", "t1a", "t1b", "t1c", "t2a", "t2b", "t3", "t4"
]
# NOTE(review): "ivc" does not look like an AJCC 8th edition lung stage group
# (IVA/IVB only, as far as I know) — confirm whether it is intentional.
_LungStageGroup = Literal[
    "0",
    "ia1",
    "ia2",
    "ia3",
    "ib",
    "iia",
    "iib",
    "iiia",
    "iiib",
    "iiic",
    "iva",
    "ivb",
    "ivc",
]


# Signature for the pathological TNM staging items of a lung cancer report.
# Inputs are the report paragraphs plus a rough JSON summary; every output is
# nullable.
class LungCancerStaging(dspy.Signature):
    """you need to extracting the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # NOTE: dspy uses the signature docstring above as the instructions sent to
    # the LM, so its wording is runtime text and is kept verbatim.

    # --- inputs ---
    report: list = dspy.InputField(
        desc=(
            "this is a pathological report for lung cancer excision, "
            "separated into paragraphs."
        ),
    )
    report_jsonized: dict = dspy.InputField(
        desc=(
            "this is a roughly structured json summary of the pathological report, "
            "which is generated by another model."
        ),
    )

    # --- outputs ---
    tnm_descriptor: Literal["y", "r", "m"] | None = dspy.OutputField(
        desc=(
            "identify the tnm descriptor of the tumor. "
            'e.g., "y" (post-therapy), "r", etc.'
        ),
    )
    pt_category: _LungPTCategory | None = dspy.OutputField(
        desc="identify the pt category of the tumor",
    )
    pn_category: Literal["nx", "n0", "n1", "n2", "n3"] | None = dspy.OutputField(
        desc="identify the pn category of the tumor",
    )
    # Clinical M values (cM0/cM1...) are deliberately coded as "mx" per `desc`.
    pm_category: Literal["mx", "m0", "m1a", "m1b", "m1c"] | None = dspy.OutputField(
        desc=(
            "identify the pm category of the tumor. "
            "if you see cM0 or cM1, etc., code as mx, "
            "since pathological M category is not available"
        ),
    )
    stage_group: _LungStageGroup | None = dspy.OutputField(
        desc="identify the stage group of the tumor",
    )
    ajcc_version: int | None = dspy.OutputField(
        desc="identify the ajcc version of the pathological staging",
    )


# Signature that extracts the list of surgical margins (as LungMargin entries)
# from a lung cancer excision report. Inputs are the report paragraphs plus a
# rough JSON summary.
class LungCancerMargins(dspy.Signature):
    """you need to extracting the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # NOTE: dspy uses the signature docstring above as the instructions sent to
    # the LM, so its wording is runtime text and is kept verbatim.

    report: list = dspy.InputField(
        desc="this is a pathological report for lung cancer excision, separated into paragraphs."
    )
    report_jsonized: dict = dspy.InputField(
        desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
    )
    # The inline example has to agree with the LungMargin.distance rule
    # ("If margin is involved, return 0"): the involved margin carries distance
    # 0 and the measured distance sits on an uninvolved margin. The example
    # previously showed an involved margin with distance 5.
    margins: list[LungMargin] | None = dspy.OutputField(
        desc="""return all of the possible involved margins and its distance from cancer. example:[{"margin_category": "bronchial", "margin_involved": true, "distance": 0}, {"margin_category": "vascular", "margin_involved": false, "distance": 5}, {"margin_category": "parenchymal", "margin_involved": false, "distance": null}]. If not present, just output null for every margin"""
    )


# Signature that extracts regional lymph node findings (as LungLN entries) plus
# two node-level summary items. Inputs are the report paragraphs plus a rough
# JSON summary.
class LungCancerLN(dspy.Signature):
    """you need to extracting the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # NOTE: dspy uses the signature docstring above as the instructions sent to
    # the LM, so its wording is runtime text and is kept verbatim.

    # --- inputs ---
    report: list = dspy.InputField(
        desc=(
            "this is a pathological report for lung cancer excision, "
            "separated into paragraphs."
        ),
    )
    report_jsonized: dict = dspy.InputField(
        desc=(
            "this is a roughly structured json summary of the pathological report, "
            "which is generated by another model."
        ),
    )

    # --- outputs ---
    # One entry per station/group; the example is embedded in the prompt text.
    regional_lymph_node: list[LungLN] | None = dspy.OutputField(
        desc=(
            "return all of the involved regional lymph node. "
            'example:[{"lymph_node_side": "right", "lymph_node_category": "4", "involved": 2, "examined": 5}, '
            '{"lymph_node_side": "left", "lymph_node_category": "7", "involved": 0, "examined": 3}]. '
            "If not present, just output null for every lymph node"
        ),
    )
    # Both items below are expected to be None when no node is metastatic.
    extranodal_extension: bool | None = dspy.OutputField(
        desc=(
            "check whether or not extranodal extension is present; "
            "if no lymph node metastasis, should be None"
        ),
    )
    maximal_ln_size: int | None = dspy.OutputField(
        desc=(
            "check the maximal size of node metastatic tumor in mm, rounded to integer; "
            "if no lymph node metastasis, should be None"
        ),
    )


# Signature that extracts the examined biomarkers (as LungBiomarker entries)
# from a lung cancer excision report. Inputs are the report paragraphs plus a
# rough JSON summary.
class LungCancerBiomarkers(dspy.Signature):
    """you need to extracting the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # NOTE: dspy uses the signature docstring above as the instructions sent to
    # the LM, so its wording is runtime text and is kept verbatim.

    # --- inputs ---
    report: list = dspy.InputField(
        desc=(
            "this is a pathological report for lung cancer excision, "
            "separated into paragraphs."
        ),
    )
    report_jsonized: dict = dspy.InputField(
        desc=(
            "this is a roughly structured json summary of the pathological report, "
            "which is generated by another model."
        ),
    )

    # --- outputs ---
    # One entry per biomarker; the example is embedded in the prompt text.
    biomarkers: list[LungBiomarker] | None = dspy.OutputField(
        desc=(
            "return all of the examined biomarkers using immunohistochemistry techniques,"
            "like alk, ros1, pd-l1. in JSON format. "
            'example:[{"biomarker_category": "ALK", "expression": true, "percentage": 70}, '
            '{"biomarker_category": "PDL1", "expression": false, "percentage": 0}]. '
            "If not present, just output None for every biomarker"
        ),
    )


# Signature for the remaining nested item: the list of histological patterns
# (as LungHistologicalPattern entries). Inputs are the report paragraphs plus a
# rough JSON summary.
class LungCancerOthernested(dspy.Signature):
    """you need to extracting the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # NOTE: dspy uses the signature docstring above as the instructions sent to
    # the LM, so its wording is runtime text and is kept verbatim.

    # --- inputs ---
    report: list = dspy.InputField(
        desc=(
            "this is a pathological report for lung cancer excision, "
            "separated into paragraphs."
        ),
    )
    report_jsonized: dict = dspy.InputField(
        desc=(
            "this is a roughly structured json summary of the pathological report, "
            "which is generated by another model."
        ),
    )

    # --- outputs ---
    # Only requested for invasive non-mucinous adenocarcinoma; otherwise None.
    histological_patterns: list[LungHistologicalPattern] | None = dspy.OutputField(
        desc=(
            "if the histology is invasive non-mucinous adenocarcinoma, "
            "please identify all of the histological patterns and its percentage. "
            "if not, return None"
        ),
    )