Files
cancer-pipeline/models/prostate.py
2025-11-30 16:46:59 -05:00

224 lines
10 KiB
Python

# -*- coding: utf-8 -*-
"""
models/prostate.py
This script sets up a series of data extraction models using the dspy library for pathology reports, specifically focusing on prostate cancer. It includes model loading, signature definitions for various cancer types, and functions to convert model predictions into structured JSON formats.
author: Hong-Kai (Walther) Chen, Po-Yen Tzeng and Kai-Po Chang @ Med NLP Lab, China Medical University
date: 2025-10-13
"""
# Module metadata.
__version__ = "1.0.0"
__date__ = "2025-10-13"
__author__ = ["Hong-Kai (Walther) Chen", "Po-Yen Tzeng", "Kai-Po Chang"]
__copyright__ = "Copyright 2025, Med NLP Lab, China Medical University"
__license__ = "MIT"
# NOTE(review): presumably the AJCC staging manual edition the schema below
# targets -- confirm against the staging signature's category lists.
__ajcc_version__ = 8
# NOTE(review): presumably the version of the CAP cancer protocol the field
# set was modelled on -- confirm with the authors.
__cap_version__ = "4.3.0.0"
import dspy
from typing import Literal
from pydantic import BaseModel, Field
# Schema for a single regional lymph-node entry extracted from a prostate
# cancer report; ProstateCancerLN.regional_lymph_node is a list of these.
#
# NOTE: this class is documented with comments rather than a class docstring
# on purpose -- pydantic would publish a docstring as the model's JSON-schema
# description, which would change the schema text handed to the language model.
class ProstateLN(BaseModel):
    # Laterality of the node group; None when the report gives no standard side.
    # (The description previously said "cervical cancer", a copy-paste slip.)
    lymph_node_side: Literal["right", "left", "midline"] | None = Field(
        None,
        description="acceptable value for lymph node side in prostate cancer. If not included in these standard sides, should be classified as None.",
    )
    # Anatomical station of the node group; "others" is the catch-all bucket.
    lymph_node_category: (
        Literal[
            "hypogastric",
            "obturator",
            "external_iliac",
            "internal_iliac",
            "common_iliac",
            "iliac_nos",
            "pelvic_nos",
            "others",
        ]
        | None
    ) = Field(
        None,
        description="acceptable value for lymph node categories (i.e. stations) in prostate cancer. If not included in these standard lymph node 'station' number, should be classified as others.",
    )
    # Both counts are required (no default).
    # NOTE(review): presumably "involved" = positive nodes and "examined" =
    # total nodes assessed for this group -- confirm with downstream consumers.
    involved: int
    examined: int
    # Free-text name of the node group as written in the report.
    station_name: str | None = Field(
        None, description="specify the name of the lymph node group/station here."
    )
# dspy signature for the flat (non-nested) items of a prostate cancer excision
# report: procedure, specimen measurements, histology, grading and the
# individual invasion / presence flags.
#
# NOTE: the class docstring is kept verbatim because dspy uses a Signature's
# docstring as the task instructions sent to the language model; maintainer
# documentation therefore lives in comments.
class ProstateCancerNonnested(dspy.Signature):
    """you need to extract the value of the specified items below from the given prostate cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # ---- inputs ----
    # The description previously said "lung cancer" (copy-paste slip); it now
    # names the correct organ, matching the instructions above.
    report: list = dspy.InputField(
        desc="this is a pathological report for prostate cancer excision, separated into paragraphs."
    )
    report_jsonized: dict = dspy.InputField(
        desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
    )

    # ---- outputs: procedure and specimen ----
    procedure: Literal["radical_prostatectomy", "others"] | None = dspy.OutputField(
        desc="identify which surgery procedure was used. e.g. radical prostatectomy"
    )
    surgical_technique: Literal["open", "robotic", "hybrid", "others"] | None = (
        dspy.OutputField(desc="identify how the surgery was taken. e.g. robotic")
    )
    prostate_weight: int | None = dspy.OutputField(
        desc="identify the weight of the prostate in grams. e.g. 50"
    )
    prostate_size: int | None = dspy.OutputField(
        desc="identify the size of the prostate in mm, largest dimension. e.g. 45 means 45mm"
    )

    # ---- outputs: histology and grading ----
    histology: (
        Literal[
            "acinar_adenocarcinoma",
            "intraductal_carcinoma",
            "ductal_adenocarcinoma",
            "mixed_acinar_ductal",
            "neuroendocrine_carcinoma_small_cell",
            "others",
        ]
        | None
    ) = dspy.OutputField(
        desc="identify the histological type of the cancer. e.g. acinar adenocarcinoma"
    )
    # Each literal is "group_<grade group>_<primary pattern>_<secondary pattern>",
    # as the desc's own example (group_2_3_4 -> 3+4) shows.
    grade: (
        Literal[
            "group_1_3_3",
            "group_2_3_4",
            "group_3_4_3",
            "group_4_4_4",
            "group_5_4_5",
            "group_5_5_4",
            "group_5_5_5",
        ]
        | None
    ) = dspy.OutputField(
        desc="identify the gleason group of the cancer. e.g. group_2_3_4 means gleason score 7 (3+4)"
    )
    gleason_4_percentage: int | None = dspy.OutputField(
        desc="identify the percentage of gleason pattern 4. e.g. 20 means 20%"
    )
    gleason_5_percentage: int | None = dspy.OutputField(
        desc="identify the percentage of gleason pattern 5. e.g. 0 means 0%"
    )
    intraductal_carcinoma_presence: bool | None = dspy.OutputField(
        desc="check whether or not intraductal carcinoma is present"
    )
    cribriform_pattern_presence: bool | None = dspy.OutputField(
        desc="check whether or not cribriform pattern is present. only in gleason score 7 or 8"
    )

    # ---- outputs: tumor extent ----
    tumor_percentage: int | None = dspy.OutputField(
        desc="identify the percentage of tumor in the prostate in both lobes. e.g. 30 means 30%"
    )
    tumor_size: int | None = dspy.OutputField(
        desc="identify the size of the tumor in mm, largest dimension. e.g. 15 means 15mm"
    )
    extraprostatic_extension: bool | None = dspy.OutputField(
        desc="check whether or not extraprostatic extension is present"
    )
    seminal_vesicle_invasion: bool | None = dspy.OutputField(
        desc="check whether or not seminal vesicle invasion is present"
    )
    bladder_invasion: bool | None = dspy.OutputField(
        desc="check whether or not bladder invasion is present"
    )
    lymphovascular_invasion: bool | None = dspy.OutputField(
        desc="check whether or not lymphovascular invasion is present"
    )
    perineural_invasion: bool | None = dspy.OutputField(
        desc="check whether or not perineural invasion is present"
    )
    distant_metastasis: bool | None = dspy.OutputField(
        desc="check whether or not distant metastasis is present"
    )

    # ---- outputs: therapy ----
    treatment_effect: str | None = dspy.OutputField(
        desc='check the treatment effect of the cancer. If you see "No known presurgical therapy", return None'
    )
# dspy signature for the pathological staging items of a prostate cancer
# excision report: TNM descriptor, pT / pN / pM categories, stage group and
# the AJCC version quoted in the report.
#
# NOTE: the class docstring is kept verbatim because dspy uses a Signature's
# docstring as the task instructions sent to the language model; maintainer
# documentation therefore lives in comments.
class ProstateCancerStaging(dspy.Signature):
    """you need to extract the value of the specified items below from the given prostate cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # ---- inputs ----
    # The description previously said "lung cancer" (copy-paste slip); it now
    # names the correct organ, matching the instructions above.
    report: list = dspy.InputField(
        desc="this is a pathological report for prostate cancer excision, separated into paragraphs."
    )
    report_jsonized: dict = dspy.InputField(
        desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
    )

    # ---- outputs ----
    tnm_descriptor: Literal["y", "r", "m"] | None = dspy.OutputField(
        desc='identify the tnm descriptor of the tumor. e.g., "y" (post-therapy), "r", etc.'
    )
    pt_category: Literal["tx", "t2", "t3a", "t3b", "t4"] | None = dspy.OutputField(
        desc="identify the pt category of the tumor"
    )
    pn_category: Literal["nx", "n0", "n1"] | None = dspy.OutputField(
        desc="identify the pn category of the tumor"
    )
    # Clinical M values (cM0 / cM1) are deliberately mapped to "mx" per the desc.
    pm_category: Literal["mx", "m0", "m1a", "m1b", "m1c"] | None = dspy.OutputField(
        desc="identify the pm category of the tumor. if you see cM0 or cM1, etc., code as mx, since pathological M category is not available"
    )
    stage_group: (
        Literal["0", "i", "iia", "iib", "iic", "iiia", "iiib", "iiic", "iva", "ivb"]
        | None
    ) = dspy.OutputField(desc="identify the stage group of the tumor")
    ajcc_version: int | None = dspy.OutputField(
        desc="identify the ajcc version of the pathological staging"
    )
# dspy signature for the surgical-margin items of a prostate cancer excision
# report: overall positivity, which margins are involved, and whether the
# positive margin is limited or non-limited in length.
#
# NOTE: the class docstring is kept verbatim because dspy uses a Signature's
# docstring as the task instructions sent to the language model; maintainer
# documentation therefore lives in comments.
class ProstateCancerMargins(dspy.Signature):
    """you need to extract the value of the specified items below from the given prostate cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # ---- inputs ----
    # The description previously said "lung cancer" (copy-paste slip); it now
    # names the correct organ, matching the instructions above.
    report: list = dspy.InputField(
        desc="this is a pathological report for prostate cancer excision, separated into paragraphs."
    )
    report_jsonized: dict = dspy.InputField(
        desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
    )

    # ---- outputs ----
    margin_positivity: bool | None = dspy.OutputField(
        desc="check whether or not any margin is positive"
    )
    # Closed vocabulary of "<side>_<location>" margin names.
    involved_margin_list: (
        list[
            Literal[
                "right_apical",
                "left_apical",
                "right_bladder_neck",
                "left_bladder_neck",
                "right_anterior",
                "left_anterior",
                "right_lateral",
                "left_lateral",
                "right_posterolateral",
                "left_posterolateral",
                "right_posterior",
                "left_posterior",
            ]
        ]
        | None
    ) = dspy.OutputField(desc="list of involved margins")
    # The 3 mm cut-off between the two values is stated in the desc itself.
    margin_length: Literal["limited", "non_limited"] | None = dspy.OutputField(
        desc="if margin is positive, check whether the length of positive margin is limited (<3mm) or non-limited (>=3mm)"
    )
# dspy signature for the lymph-node items of a prostate cancer excision
# report: per-group node counts (as ProstateLN entries), extranodal extension
# and the size of the largest nodal metastasis.
#
# NOTE: the class docstring is kept verbatim because dspy uses a Signature's
# docstring as the task instructions sent to the language model; maintainer
# documentation therefore lives in comments.
class ProstateCancerLN(dspy.Signature):
    """you need to extract the value of the specified items below from the given prostate cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # ---- inputs ----
    report: list = dspy.InputField(
        desc="this is a pathological report for prostate cancer excision, separated into paragraphs."
    )
    report_jsonized: dict = dspy.InputField(
        desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
    )

    # ---- outputs ----
    # The inline example previously had an unterminated string ("right,),
    # i.e. malformed JSON shown to the model; the closing quote is restored.
    regional_lymph_node: list[ProstateLN] | None = dspy.OutputField(
        desc="""return all of the involved regional lymph node. example:[{"lymph_node_side": "right", "lymph_node_category": "internal_iliac", "involved": 2, "examined": 5}, ...]. If not present, just output null for every lymph node"""
    )
    extranodal_extension: bool | None = dspy.OutputField(
        desc="check whether or not extranodal extension is present; if no lymph node metastasis, should be None"
    )
    maximal_ln_size: int | None = dspy.OutputField(
        desc="check the maximal size of node metastatic tumor in mm, rounded to integer; if no lymph node metastasis, should be None"
    )