From a91e8714812d2980b3382a6609562f763232d1a1 Mon Sep 17 00:00:00 2001 From: Farouk Adeleke Date: Sun, 30 Nov 2025 16:46:59 -0500 Subject: [PATCH] (no commit message) --- __init__.py | 10 ++ auto_classes.json | 4 + models/__init__.py | 0 models/breast.py | 305 ++++++++++++++++++++++++++++++++++++++++ models/cervix.py | 234 +++++++++++++++++++++++++++++++ models/colon.py | 271 ++++++++++++++++++++++++++++++++++++ models/common.py | 123 ++++++++++++++++ models/esophagus.py | 217 +++++++++++++++++++++++++++++ models/liver.py | 211 ++++++++++++++++++++++++++++ models/lung.py | 308 +++++++++++++++++++++++++++++++++++++++++ models/modellist.py | 70 ++++++++++ models/pancreas.py | 215 ++++++++++++++++++++++++++++ models/prostate.py | 223 +++++++++++++++++++++++++++++ models/stomach.py | 217 +++++++++++++++++++++++++++++ models/thyroid.py | 211 ++++++++++++++++++++++++++++ pipeline.py | 197 ++++++++++++++++++++++++++ pyproject.toml | 7 + util/__init__.py | 0 util/predictiondump.py | 241 ++++++++++++++++++++++++++++++++ 19 files changed, 3064 insertions(+) create mode 100644 __init__.py create mode 100644 auto_classes.json create mode 100644 models/__init__.py create mode 100644 models/breast.py create mode 100644 models/cervix.py create mode 100644 models/colon.py create mode 100644 models/common.py create mode 100644 models/esophagus.py create mode 100644 models/liver.py create mode 100644 models/lung.py create mode 100644 models/modellist.py create mode 100644 models/pancreas.py create mode 100644 models/prostate.py create mode 100644 models/stomach.py create mode 100644 models/thyroid.py create mode 100644 pipeline.py create mode 100644 pyproject.toml create mode 100644 util/__init__.py create mode 100644 util/predictiondump.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..345d0d8 --- /dev/null +++ b/__init__.py @@ -0,0 +1,10 @@ +""" +init file for total registrar, a pathology report structuring tool using LLM +""" + +# -*- coding: utf-8 -*- 
+__version__ = "0.1.0" +__date__ = "2025-10-05" +__author__ = ["Hong-Kai (Walther) Chen", "Po-Yen Tzeng", "Kai-Po Chang"] +__copyright__ = "Copyright 2025, Med NLP Lab, China Medical University" +__license__ = "MIT" diff --git a/auto_classes.json b/auto_classes.json new file mode 100644 index 0000000..c400695 --- /dev/null +++ b/auto_classes.json @@ -0,0 +1,4 @@ +{ + "AutoConfig": "pipeline.CancerPipelineConfig", + "AutoAgent": "pipeline.CancerPipeline" +} \ No newline at end of file diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/breast.py b/models/breast.py new file mode 100644 index 0000000..24efab5 --- /dev/null +++ b/models/breast.py @@ -0,0 +1,305 @@ +# -*- coding: utf-8 -*- +""" +models/breast.py +This script sets up a series of data extraction models using the dspy library for pathology reports, specifically focusing on breast cancer. It includes model loading, signature definitions for various cancer types, and functions to convert model predictions into structured JSON formats. + +author: Hong-Kai (Walther) Chen, Po-Yen Tzeng and Kai-Po Chang @ Med NLP Lab, China Medical University +date: 2025-10-13 +""" + +__version__ = "1.0.0" +__date__ = "2025-10-13" +__author__ = ["Hong-Kai (Walther) Chen", "Po-Yen Tzeng", "Kai-Po Chang"] +__copyright__ = "Copyright 2025, Med NLP Lab, China Medical University" +__license__ = "MIT" +__ajcc_version__ = 8 +__cap_version__ = "4.10.0.0" + +import dspy +from typing import Literal +from pydantic import BaseModel, Field + + +class BreastMargin(BaseModel): + margin_category: ( + Literal[ + "12_3_clock", + "3_6_clock", + "6_9_clock", + "9_12_clock", + "12_clock", + "3_clock", + "6_clock", + "9_clock", + "superficial", + "base", + ] + | None + ) = Field( + None, + description="acceptable value for surgical margins in breast cancer. 
If not included in these standard margins, should be classified as others.", + ) + margin_involved: bool + distance: int | None = Field( + None, + description="If margin is involved, return 0. If margin is uninvolved/free, try your best to find the distance at both microscopic and macroscopic(gross) description, and specify the distance from tumor to margin in mm, rounded to integer. If the margin is uninvolved/free and, after your best effort, the distance is still not specified, return null", + ) + description: str | None + + +class BreastLN(BaseModel): + lymph_node_side: Literal["right", "left", "midline"] | None = Field( + None, + description="acceptable value for lymph node side in breast cancer. If not included in these standard sides, should be classified as None.", + ) + lymph_node_category: Literal["sentinel", "nonsentinel", "others"] | None = Field( + None, + description="acceptable value for lymph node categories (i.e. stations) in breast cancer. If not included in these standard lymph node 'station' number, should be classified as others.", + ) + involved: int + examined: int + station_name: str | None = Field( + None, description="specify the name of the lymph node station here." + ) + + +class BreastBiomarker(BaseModel): + biomarker_category: Literal["er", "pr", "her2", "ki67", "others"] | None = Field( + None, + description="acceptable value for biomarker categories in breast cancer. If not included in these standard categories, should be classified as others.", + ) + expression: bool | None = Field( + None, + description="specify whether or not the biomarker is expressed here.For Her-2 please refer to the score field, and don't fill in this field.", + ) + percentage: int | None = Field( + None, + description="the percentage of tumor cells showing positive expression of the biomarker, rounded to integer. 
if not specified, return null", + ) + score: Literal[0, 1, 2, 3] | None = Field( + None, + description="specify the Her-2 expression score, negative: score 0 or 1, equivocal: score 2, positive: score 3 of the biomarker here, if applicable.", + ) + biomarker_name: str | None = Field( + None, description="specify the name of the biomarker here." + ) + + +class BreastCancerNonnested(dspy.Signature): + """you need to extract the value of the specified items below from the given breast cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for breast cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + procedure: ( + Literal[ + "partial_mastectomy", + "simple_mastectomy", + "breast_conserving_surgery", + "modified_radical_mastectomy", + "total_mastectomy", + "wide_excision", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify which surgery procedure was used. e.g. partial mastectomy" + ) + cancer_quadrant: ( + Literal[ + "upper_outer_quadrant", + "upper_inner_quadrant", + "lower_outer_quadrant", + "lower_inner_quadrant", + "nipple", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify the primary site of cancer. e.g. upper outer quadrant. please consider what side is the breast excision specimen when you determine the quadrant" + ) + cancer_clock: Literal[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] | None = ( + dspy.OutputField( + desc="identify the clock position of the cancer if mentioned. e.g. 3" + ) + ) + cancer_laterality: Literal["right", "left", "bilateral"] | None = dspy.OutputField( + desc="identify the side of the cancer. e.g. 
right" + ) + histology: ( + Literal[ + "invasive_carcinoma_no_special_type", + "invasive_lobular_carcinoma", + "mixed_ductal_and_lobular_carcinoma", + "tubular_adenocarcinoma", + "mucinous_adenocarcinoma", + "encapsulated_papillary_carcinoma", + "solid_papillary_carcinoma", + "inflammatory_carcinoma", + "other_special_types", + ] + | None + ) = dspy.OutputField( + desc="identify the histological type of the cancer. e.g. invasive carcinoma of no special type. If the histological type is not included in the above list, please code as other_special_types and specify the histological type in the description field." + ) + tumor_size: int | None = dspy.OutputField( + desc="identify the size of the tumor in mm, rounded, if multiple tumors are present, please provide the size of the largest tumor" + ) + lymphovascular_invasion: bool | None = dspy.OutputField( + desc="check whether or not lymphovascular invasion is present" + ) + perineural_invasion: bool | None = dspy.OutputField( + desc="check whether or not perineural invasion is present" + ) + distant_metastasis: bool | None = dspy.OutputField( + desc="check whether or not distant metastasis is present" + ) + treatment_effect: str | None = dspy.OutputField( + desc='check the treatment effect of the cancer. If you see "No known presurgical therapy", return None' + ) + + +class DCIS(dspy.Signature): + """ + You need to extract data about ductal carcinoma in situ below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS. + """ + + report: list = dspy.InputField( + desc="this is a pathological report for breast cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." 
+ ) + dcis_present: bool | None = dspy.OutputField( + desc="check whether or not ductal carcinoma in situ is present" + ) + dcis_size: int | None = dspy.OutputField( + desc="if ductal carcinoma in situ is present, identify the size of the largest DCIS in mm, rounded" + ) + dcis_comedo_necrosis: bool | None = dspy.OutputField( + desc="if ductal carcinoma in situ is present, check whether or not comedo necrosis is present" + ) + dcis_grade: Literal[1, 2, 3] | None = dspy.OutputField( + desc="if ductal carcinoma in situ is present, identify the grade of DCIS, low grade (grade 1), intermediate grade (grade 2), high grade (grade 3)" + ) + + +class BreastCancerStaging(dspy.Signature): + """you need to extract the value of the specified items below from the given breast cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for breast cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + tnm_descriptor: Literal["y", "r", "m"] | None = dspy.OutputField( + desc='identify the tnm descriptor of the tumor. e.g., "y" (post-therapy), "r", etc.' + ) + pt_category: ( + Literal[ + "tx", "tis", "t1mi", "t1a", "t1b", "t1c", "t2", "t3", "t4a", "t4b", "t4c" + ] + | None + ) = dspy.OutputField(desc="identify the pt category of the tumor") + pn_category: ( + Literal[ + "nx", "n0", "n1mi", "n1a", "n1b", "n1c", "n2a", "n2b", "n3a", "n3b", "n3c" + ] + | None + ) = dspy.OutputField(desc="identify the pn category of the tumor") + pm_category: Literal["mx", "m0", "m1"] | None = dspy.OutputField( + desc="identify the pm category of the tumor. 
if you see cM0 or cM1, etc., code as mx, since pathological M category is not available" + ) + pathologic_stage_group: ( + Literal["0", "ia", "ib", "iia", "iib", "iiia", "iiib", "iiic", "iv"] | None + ) = dspy.OutputField( + desc="identify the pathologic stage group of the tumor, return none if only anatomical stage group is given, dont guess" + ) + anatomic_stage_group: ( + Literal["0", "ia", "ib", "iia", "iib", "iiia", "iiib", "iiic", "iv"] | None + ) = dspy.OutputField(desc="identify the anatomic stage group of the tumor") + ajcc_version: int | None = dspy.OutputField( + desc="identify the ajcc version of the pathological staging" + ) + + +class BreastCancerGrading(dspy.Signature): + """you need to extract the value of the specified items below from the given breast cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for breast cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + nuclear_grade: Literal[1, 2, 3] | None = dspy.OutputField( + desc="identify the nuclear grade of the tumor" + ) + tubule_formation: Literal[1, 2, 3] | None = dspy.OutputField( + desc="identify the tubule formation score of the tumor" + ) + mitotic_rate: Literal[1, 2, 3] | None = dspy.OutputField( + desc="identify the mitotic rate score of the tumor" + ) + total_score: Literal[3, 4, 5, 6, 7, 8, 9] | None = dspy.OutputField( + desc="identify the total score of the tumor" + ) + grade: Literal[1, 2, 3] | None = dspy.OutputField( + desc="identify the grade of the tumor" + ) + + +class BreastCancerMargins(dspy.Signature): + """you need to extract the value of the specified items below from the given breast cancer excision report. DO NOT JUST RETURN NULL. 
IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for breast cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + margins: list[BreastMargin] | None = dspy.OutputField( + desc="""return all of the possible involved margins and their distance from cancer. example:[{"margin_category": "superficial", "margin_involved": true, "distance": 0}, {"margin_category": "base", "margin_involved": false, "distance": 5}, {"margin_category": "12_3_clock", "margin_involved": false, "distance": null}]. If not present, just output null for every margin""" + ) + + +class BreastCancerLN(dspy.Signature): + """you need to extract the value of the specified items below from the given breast cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for breast cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + regional_lymph_node: list[BreastLN] | None = dspy.OutputField( + desc="""return all of the involved regional lymph node. example:[{"lymph_node_side": "right", "lymph_node_category": "sentinel", "involved": 2, "examined": 5, "station_name": "station 1"}, {"lymph_node_side": "left", "lymph_node_category": "nonsentinel", "involved": 0, "examined": 3, "station_name": "station 2"}, ...].
If not present, just output null for every lymph node""" + ) + extranodal_extension: bool | None = dspy.OutputField( + desc="check whether or not extranodal extension is present; if no lymph node metastasis, should be None" + ) + maximal_ln_size: int | None = dspy.OutputField( + desc="check the maximal size of node metastatic tumor in mm, rounded to integer; if no lymph node metastasis, should be None" + ) + + +class BreastCancerBiomarkers(dspy.Signature): + """you need to extract breast cancer biomarkers, which is very important, below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for breast cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + biomarkers: list[BreastBiomarker] | None = dspy.OutputField( + desc="""return all of the examined immunoreceptors using immunohistochemistry techniques,like er, pr, her2, ki67, etc. in a list of pydantic structures, + example: [{"biomarker_category": "er", "expression": true, "percentage": 90, "score": null, "biomarker_name": "estrogen receptor"}, + {"biomarker_category": "pr", "expression": false, "percentage": 0, "score": null, "biomarker_name": "progesterone receptor"}, + {"biomarker_category": "her2", "expression": null, "percentage": 0, "score": 2, "biomarker_name": "human epidermal growth factor receptor 2"}, + {"biomarker_category": "ki67", "expression": null, "percentage": 30, "score": null, "biomarker_name": "ki67 proliferation index"}]. 
If not present, just output null for every biomarker""" + ) diff --git a/models/cervix.py b/models/cervix.py new file mode 100644 index 0000000..d2e4fc0 --- /dev/null +++ b/models/cervix.py @@ -0,0 +1,234 @@ +# -*- coding: utf-8 -*- +""" +models/cervix.py +This script sets up a series of data extraction models using the dspy library for pathology reports, specifically focusing on cervical cancer. It includes model loading, signature definitions for various cancer types, and functions to convert model predictions into structured JSON formats. + +author: Hong-Kai (Walther) Chen, Po-Yen Tzeng and Kai-Po Chang @ Med NLP Lab, China Medical University +date: 2025-10-13 +""" + +__version__ = "1.0.0" +__date__ = "2025-10-13" +__author__ = ["Hong-Kai (Walther) Chen", "Po-Yen Tzeng", "Kai-Po Chang"] +__copyright__ = "Copyright 2025, Med NLP Lab, China Medical University" +__license__ = "MIT" +__ajcc_version__ = 8 +__cap_version__ = "5.1.0.0" + +import dspy +from typing import Literal +from pydantic import BaseModel, Field + + +class CervixMargin(BaseModel): + margin_category: ( + Literal[ + "ectocervical", + "endocervical", + "radial_circumferential", + "vaginal_cuff", + "others", + ] + | None + ) = Field( + None, + description="acceptable value for surgical margins in cervical cancer. If not included in these standard margins, should be classified as others.", + ) + margin_involved: bool + distance: int | None = Field( + None, + description="If margin is involved, return 0. If margin is uninvolved/free, try your best to find the distance at both microscopic and macroscopic(gross) description, and specify the distance from tumor to margin in mm, rounded to integer. 
If the margin is uninvolved/free and, after your best effort, the distance is still not specified, return null", + ) + description: str | None + + +class CervixLN(BaseModel): + lymph_node_side: Literal["right", "left", "midline"] | None = Field( + None, + description="acceptable value for lymph node side in cervical cancer. If not included in these standard sides, should be classified as None.", + ) + lymph_node_category: ( + Literal[ + "pelvic", + "para_aortic", + "internal_iliac", + "obturator", + "external_iliac", + "common_iliac", + "parametrial", + "others", + ] + | None + ) = Field( + None, + description="acceptable value for lymph node categories in cervical cancer. If not included in these standard lymph node 'station' number, should be classified as others.", + ) + involved: int + examined: int + station_name: str | None = Field( + None, description="specify the name of the lymph node station here." + ) + + +class CervixCancerNonnested(dspy.Signature): + """you need to extracting the value of the specified items below from the given cervical cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for cervical cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + procedure: ( + Literal[ + "radical_hysterectomy", + "total_hysterectomy_bso", + "simple_hysterectomy", + "extenteration", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify which surgery procedure was used. e.g. radical_hysterectomy" + ) + surgical_technique: Literal["open", "laparoscopic", "vaginal", "others"] | None = ( + dspy.OutputField(desc="identify how the surgery was taken. e.g. 
laparoscopic") + ) + cancer_primary_site: ( + Literal["12_3_clock", "3_6_clock", "6_9_clock", "9_12_clock"] | None + ) = dspy.OutputField(desc="identify the primary site of cancer. e.g. 12_3_clock") + histology: ( + Literal[ + "squamous_cell_carcinoma_hpv_associated", + "squamous_cell_carcinoma_hpv_dependaent", + "squamous_cell_carcinoma_nos", + "adenocarcinoma_hpv_associated", + "adenocarcinoma_hpv_independent", + "adenocarcinoma_nos", + "adenosquamous_carcinoma", + "neuroendocrine_carcinoma", + "glassy_cell_carcinoma", + "small_cell_carcinoma", + "large_cell_carcinoma", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify the histological type of the cancer. e.g. squamous cell carcinoma" + ) + grade: Literal[1, 2, 3] | None = dspy.OutputField( + desc="identify the grade of the cancer, well->1, moderate->2, poor/undiff->3" + ) + tumor_size: int | None = dspy.OutputField( + desc="identify the size of the tumor in mm, rounded to integer" + ) + depth_of_invasion_number: ( + Literal["less_than_3", "3_to_5", "greater_than_5"] | None + ) = dspy.OutputField( + desc="identify the depth of invasion of the tumor in mm, and choose from these three categories: less_than_3, 3_to_5, greater_than_5" + ) + depth_of_invasion_three_tier: ( + Literal["inner_third", "middle_third", "outer_third"] | None + ) = dspy.OutputField( + desc="identify the depth of invasion of the tumor in three-tier system: inner_third, middle_third, outer_third" + ) + distant_metastasis: bool | None = dspy.OutputField( + desc="check whether or not distant metastasis is present" + ) + treatment_effect: str | None = dspy.OutputField( + desc='check the treatment effect of the cancer. If you see "No known presurgical therapy", return None' + ) + + +class CervixCancerStaging(dspy.Signature): + """you need to extract the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL.
IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for cervical cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + tnm_descriptor: Literal["y", "r", "m"] | None = dspy.OutputField( + desc='identify the tnm descriptor of the tumor. e.g., "y" (post-therapy), "r", etc.' + ) + pt_category: ( + Literal[ + "tx", + "t1a1", + "t1a2", + "t1b1", + "t1b2", + "t1b3", + "t2a1", + "t2a2", + "t2b", + "t3a", + "t3b", + "t4", + ] + | None + ) = dspy.OutputField(desc="identify the pt category of the tumor") + pn_category: Literal["nx", "n0", "n1mi", "n1a", "n2mi", "n2a"] | None = ( + dspy.OutputField(desc="identify the pn category of the tumor") + ) + pm_category: Literal["mx", "m0", "m1"] | None = dspy.OutputField( + desc="identify the pm category of the tumor. if you see cM0 or cM1, etc., code as mx, since pathological M category is not available" + ) + stage_group: ( + Literal[ + "0", + "ia1", + "ia2", + "ib1", + "ib2", + "ib3", + "iia1", + "iia2", + "iib", + "iiia", + "iiib", + "iiic1", + "iiic2", + "iva", + "ivb", + ] + | None + ) = dspy.OutputField(desc="identify the FIGO stage group of the tumor") + ajcc_version: int | None = dspy.OutputField( + desc="identify the ajcc version of the pathological staging" + ) + + +class CervixCancerMargins(dspy.Signature): + """you need to extracting the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for cervical cancer excision, separated into paragraphs." 
+ ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + margins: list[CervixMargin] | None = dspy.OutputField( + desc="""return all of the possible involved margins and their distance from cancer. example:[{"margin_category": "ectocervical", "margin_involved": true, "distance": 0}, {"margin_category": "endocervical", "margin_involved": false, "distance": null}, {"margin_category": "vaginal_cuff", "margin_involved": false, "distance": null}]. If not present, just output null for every margin""" + ) + + +class CervixCancerLN(dspy.Signature): + """you need to extract the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for cervical cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + regional_lymph_node: list[CervixLN] | None = dspy.OutputField( + desc="""return all of the involved regional lymph node. example:[{"lymph_node_category": "pelvic", "involved": 2, "examined": 5}, {"lymph_node_category": "para_aortic", "involved": 0, "examined": 3}].
If not present, just output null for every lymph node""" + ) + extranodal_extension: bool | None = dspy.OutputField( + desc="check whether or not extranodal extension is present; if no lymph node metastasis, should be None" + ) + maximal_ln_size: int | None = dspy.OutputField( + desc="check the maximal size of node metastatic tumor in mm, rounded to integer; if no lymph node metastasis, should be None" + ) diff --git a/models/colon.py b/models/colon.py new file mode 100644 index 0000000..bd297cd --- /dev/null +++ b/models/colon.py @@ -0,0 +1,271 @@ +# -*- coding: utf-8 -*- +""" +models/colon.py +This script sets up a series of data extraction models using the dspy library for pathology reports, specifically focusing on colorectal cancer. It includes model loading, signature definitions for various cancer types, and functions to convert model predictions into structured JSON formats. + +author: Hong-Kai (Walther) Chen, Po-Yen Tzeng and Kai-Po Chang @ Med NLP Lab, China Medical University +date: 2025-10-15 +""" + +__version__ = "1.0.1" +__date__ = "2025-10-13" +__author__ = ["Hong-Kai (Walther) Chen", "Po-Yen Tzeng", "Kai-Po Chang"] +__copyright__ = "Copyright 2025, Med NLP Lab, China Medical University" +__license__ = "MIT" +__ajcc_version__ = 8 +__cap_version__ = "4.3.1.0" + +import dspy +from typing import Literal +from pydantic import BaseModel, Field + + +class ColonMargin(BaseModel): + margin_category: ( + Literal[ + "proximal", + "distal", + "mesenteric_pedicle", + "radial_or_circumferencial", + "outmost_of_adhered_tissue", + "others", + ] + | None + ) = Field( + None, + description="acceptable value for surgical margins in colon cancer. If not included in these standard margins, should be classified as others.", + ) + margin_involved: bool + distance: int | None = Field( + None, + description="If margin is involved, return 0. 
If margin is uninvolved/free, try your best to find the distance at both microscopic and macroscopic(gross) description, and specify the distance from tumor to margin in mm, rounded to integer. If the margin is uninvolved/free and, after your best effort, the distance is still not specified, return null", + ) + description: str | None + + +class ColonLN(BaseModel): + lymph_node_category: Literal["regional", "mesenteric", "others"] | None = Field( + None, + description="acceptable value for lymph node categories in colon cancer. If not included in these standard lymph node 'station' number, should be classified as others.", + ) + involved: int + examined: int + station_name: str | None = Field( + None, description="specify the name of the lymph node group/station here." + ) + + +class ColonBiomarker(BaseModel): + biomarker_category: ( + Literal["mlh1", "msh2", "msh6", "pms2", "her2", "others"] | None + ) = Field( + None, + description="acceptable value for biomarker categories in colon cancer. If not included in these standard categories, should be classified as others.", + ) + expression: bool | None = Field( + None, + description="specify whether or not the biomarker is expressed here.For Her-2 please refer to the score field, and don't fill in this field.", + ) + percentage: int | None = Field( + None, + description="the percentage of tumor cells showing positive expression of the biomarker, rounded to integer. if not specified, return null", + ) + score: Literal[0, 1, 2, 3] | None = Field( + None, + description="specify the Her-2 expression score, negative: score 0 or 1, equivocal: score 2, positive: score 3 of the biomarker here, if applicable.", + ) + biomarker_name: str | None = Field( + None, description="specify the name of the biomarker here." + ) + + +class ColonCancerNonnested(dspy.Signature): + """you need to extract the value of the specified items below from the given colon cancer report. DO NOT JUST RETURN NULL. 
IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for colon cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + procedure: ( + Literal[ + "right_hemicolectomy", + "extended_right_hemicolectomy", + "left_hemicolectomy", + "low_anterior_resection", + "anterior_resection", + "abdominoperineal_resection", + "total_mesorectal_excision", + "total_colectomy", + "subtotal_colectomy", + "segmental_colectomy", + "transanal_local_excision", + "polypectomy", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify which surgery procedure was used. e.g. polypectomy" + ) + surgical_technique: ( + Literal["open", "laparoscopic", "robotic", "ta_tme", "hybrid", "others"] | None + ) = dspy.OutputField(desc="identify how the surgery was taken. e.g. laparoscopic") + cancer_primary_site: ( + Literal[ + "cecum", + "ascending_colon", + "hepatic_flexure", + "transverse_colon", + "splenic_flexure", + "descending_colon", + "sigmoid_colon", + "rectosigmoid_junction", + "rectum", + "appendix", + ] + | None + ) = dspy.OutputField(desc="identify the primary site of cancer. e.g. sigmoid colon") + histology: ( + Literal[ + "adenocarcinoma", + "mucinous_adenocarcinoma", + "signet_ring_cell_carcinoma", + "medullary_carcinoma", + "micropapillary_adenocarcinoma", + "serrated_adenocarcinoma", + "adenosquamous_carcinoma", + "neuroendocrine_carcinoma", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify the histological type of the cancer. e.g. 
adenocarcinoma, if not in the list, classify as others" + ) + grade: int | None = dspy.OutputField( + desc="identify the grade of the cancer, well->1, moderate->2, poor->3, undiff-4" + ) + tumor_invasion: ( + Literal[ + "lamina_propria", + "submucosa", + "muscularis_propria", + "pericolorectal_tissue", + "visceral_peritoneum_surface", + "adjacent_organs_structures", + ] + | None + ) = dspy.OutputField( + desc="identify the part invasioned by tumor, e.g lamina propria" + ) + lymphovascular_invasion: bool | None = dspy.OutputField( + desc="check whether or not lymphovascular invasion is present" + ) + perineural_invasion: bool | None = dspy.OutputField( + desc="check whether or not perineural invasion is present" + ) + extracellular_mucin: bool | None = dspy.OutputField( + desc="check whether or not extracellular mucin is present" + ) + signet_ring: bool | None = dspy.OutputField( + desc="check whether or not signet ring cell is present" + ) + tumor_budding: int | None = dspy.OutputField( + desc="identify the number of tumor budding of the cancer.low->0, moderate->1, high->2" + ) + type_of_polyp: ( + Literal[ + "tubular_adenoma", + "tubulovillous_adenoma", + "villous_adenoma", + "sessile_serrated_adenoma", + "traditional_serrated_adenoma", + ] + | None + ) = dspy.OutputField(desc="identify the type of polyp of the tumor") + distant_metastasis: bool | None = dspy.OutputField( + desc="check whether or not distant metastasis is present" + ) + treatment_effect: str | None = dspy.OutputField( + desc='check the treatment effect of the cancer. If you see "No known presurgical therapy", return None' + ) + + +class ColonCancerStaging(dspy.Signature): + """you need to extract the value of the specified items below from the given colon cancer report. DO NOT JUST RETURN NULL. 
IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for colon cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + tnm_descriptor: Literal["y", "r", "m"] | None = dspy.OutputField( + desc='identify the tnm descriptor of the tumor. e.g., "y" means post-therapy, "m" means multiple primary tumor, "r" means recurrent tumor, code this if the report explicitly mean this, dont guess' + ) + pt_category: Literal["tx", "tis", "t1", "t2", "t3", "t4a", "t4b"] | None = ( + dspy.OutputField(desc="identify the pt category of the tumor") + ) + pn_category: Literal["nx", "n0", "n1a", "n1b", "n1c", "n2a", "n2b"] | None = ( + dspy.OutputField(desc="identify the pn category of the tumor") + ) + pm_category: Literal["mx", "m0", "m1a", "m1b", "m1c"] | None = dspy.OutputField( + desc="identify the pm category of the tumor. if you see cM0 or cM1, etc., code as mx, since pathological M category is not available" + ) + stage_group: ( + Literal[ + "0", "i", "iia", "iib", "iic", "iiia", "iiib", "iiic", "iva", "ivb", "ivc" + ] + | None + ) = dspy.OutputField(desc="identify the stage group of the tumor") + ajcc_version: int | None = dspy.OutputField( + desc="identify the ajcc version of the pathological staging" + ) + + +class ColonCancerMargins(dspy.Signature): + """you need to extracting the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for colon cancer excision, separated into paragraphs." 
+ ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + margins: list[ColonMargin] | None = dspy.OutputField( + desc="""return all of the possible involved margins and its distance from cancer. example:[{"margin_category": "proximal", "margin_involved": true, "distance": 5}, {"margin_category": "distal", "margin_involved": false, "distance": null}, {"margin_category": "mesenteric", "margin_involved": false, "distance": null}]. If not present, just output null for every margin""" + ) + + +class ColonCancerLN(dspy.Signature): + """you need to extracting the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for colon cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + regional_lymph_node: list[ColonLN] | None = dspy.OutputField( + desc="""return all of the involved regional lymph node. example:[{"lymph_node_category": "regional", "involved": 2, "examined": 5}, {"lymph_node_category": "mesenteric", "involved": 0, "examined": 3}]. 
If not present, just output null for every lymph node""" + ) + extranodal_extension: bool | None = dspy.OutputField( + desc="check whether or not extranodal extension is present; if no lymph node metastasis, should be None" + ) + maximal_ln_size: int | None = dspy.OutputField( + desc="check the maximal size of node metastatic tumor in mm, rounded to integer; if no lymph node metastasis, should be None" + ) + + +class ColonCancerBiomarkers(dspy.Signature): + """you need to extracting the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for colon cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + biomarkers: list[ColonBiomarker] | None = dspy.OutputField( + desc="""return all of the examined immunoreceptors using immunohistochemistry techniques,like mlh1, msh2, etc. in JSON format. example:{"MSH6": {"expression": true,"percentage": 90}, "PMS2": {"expression": true, "percentage": 100}},"mlh1": {"expression": true, "percentage": 100}}.""" + ) diff --git a/models/common.py b/models/common.py new file mode 100644 index 0000000..9a56b31 --- /dev/null +++ b/models/common.py @@ -0,0 +1,123 @@ +# -*- coding: utf-8 -*- +""" +models/common.py +This script sets up a series of data extraction models using the dspy library for pathology reports. Common model includes basic dspy functionality, cancer examination, and json handling. It includes model loading, signature definitions for various cancer types, and functions to convert model predictions into structured JSON formats. 
+ +author: Hong-Kai (Walther) Chen, Po-Yen Tzeng and Kai-Po Chang @ Med NLP Lab, China Medical University +date: 2025-10-13 +""" + +__version__ = "1.0.0" +__date__ = "2025-10-13" +__author__ = ["Hong-Kai (Walther) Chen", "Po-Yen Tzeng", "Kai-Po Chang"] +__copyright__ = "Copyright 2025, Med NLP Lab, China Medical University" +__license__ = "MIT" + +import dspy +from typing import Literal + +model_list = { + "gemma4b": "ollama_chat/gemma3:4b", + "gemma1b": "ollama_chat/gemma3:1b", + "med8b": "ollama_chat/thewindmom/llama3-med42-8b", + "gemma12b": "ollama_chat/gemma3:12b", + "gemma27b": "ollama_chat/gemma3:27b", + "med70b": "ollama_chat/thewindmom/llama3-med42-70b", + "gpt": "ollama_chat/gpt-oss:20b", + "phi4": "ollama_chat/phi4", + "qwen30b": "ollama_chat/qwen3:30b", +} + +localaddr = "http://localhost:11434" + + +def load_model(model_name: str): + if model_name not in model_list: + raise ValueError( + f"Model {model_name} not found. Available models: {list(model_list.keys())}" + ) + + model = model_list[model_name] + lm = dspy.LM( + model=model, + api_base=localaddr, + api_key="", + model_type="chat", + top_p=0.7, + max_tokens=16384, + num_ctx=16384, + temperature=0.7, + seed=10, + ) + print(f"Loaded model: {model_name}") + return lm + + +# 2 . define classes and set up Signatures + + +def autoconf_dspy(model_name: str): + lm = load_model(model_name) + dspy.configure(lm=lm) + + +class is_cancer(dspy.Signature): + """You are a cancer registrar, you need to identify whether or not this report belongs to PRIMARY cancer excision eligible for cancer registry, and if so, which organ the cancer belongs to. If no viable tumor is present after excision, you should not register this case. If only carcinoma in situ or high-grade dysplasia, you should not register this case.""" + + report: list = dspy.InputField( + desc="this is a pathologic report, separated into paragraphs. 
you should determine whether or not this report belongs to cancer excision eligible for cancer registry" + ) + + cancer_excision_report: bool = dspy.OutputField( + desc="identify whether or not this report belongs to PRIMARY cancer excision eligible for registry for cancer excision. If no viable tumor is present after excision, you should not register this case. If only carcinoma in situ or high-grade dysplasia, you should not register this case." + ) # a point + # exp: + cancer_category: ( + Literal[ + "stomach", + "colorectal", + "breast", + "esophagus", + "lung", + "prostate", + "thyroid", + "pancreas", + "cervix", + "liver", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify which organ the primary cancer arises from. Currently only ten are implemented, if it IS a cancer excision report, but primary site not included in these standard organs, should be classified as others." + ) + cancer_category_others_description: str | None = dspy.OutputField( + desc="if is cancer_excision report AND cancer_category is others, please specify the organ here. if not, return null." + ) + + +class ReportJsonize(dspy.Signature): + """You are cancer registrar, and you are assigned a task to manually convert the raw pathology report into a roughly structured json format. Keep original wording as much as possible. Try to follow the order of cancer checklists.""" + + report: list = dspy.InputField( + desc="this is a raw pathological report, separated into paragraphs. You need to convert it into a roughly structured json format, keeping original wording as much as possible." + ) + cancer_category: ( + Literal[ + "stomach", + "colorectal", + "breast", + "esophagus", + "lung", + "prostate", + "pancreas", + "thyroid", + "cervix", + "liver", + ] + | None + ) = dspy.InputField( + desc="which part the cancer belongs to. You need to convert it into a roughly structured json format, keeping original wording as much as possible." 
+ ) + output: dict = dspy.OutputField( + desc="You are cancer registrar, and you are assigned a task to manually convert the raw pathology report into a roughly structured json format. Keep original wording as much as possible. Try to follow the order of cancer checklists." + ) diff --git a/models/esophagus.py b/models/esophagus.py new file mode 100644 index 0000000..d71b219 --- /dev/null +++ b/models/esophagus.py @@ -0,0 +1,217 @@ +# -*- coding: utf-8 -*- +""" +models/esophagus.py +This script sets up a series of data extraction models using the dspy library for pathology reports, specifically focusing on esophageal cancer. It includes model loading, signature definitions for various cancer types, and functions to convert model predictions into structured JSON formats. + +author: Hong-Kai (Walther) Chen, Po-Yen Tzeng and Kai-Po Chang @ Med NLP Lab, China Medical University +date: 2025-10-13 +""" + +__version__ = "1.0.0" +__date__ = "2025-10-13" +__author__ = ["Hong-Kai (Walther) Chen", "Po-Yen Tzeng", "Kai-Po Chang"] +__copyright__ = "Copyright 2025, Med NLP Lab, China Medical University" +__license__ = "MIT" +__ajcc_version__ = 8 +__cap_version__ = "4.2.0.1" + +import dspy +from typing import Literal +from pydantic import BaseModel, Field + + +class EsophagusMargin(BaseModel): + margin_category: ( + Literal["proximal", "distal", "radial", "lateral", "deep", "others"] | None + ) = Field( + None, + description="acceptable value for surgical margins in esophageal cancer. If not included in these standard margins, should be classified as others.", + ) + margin_involved: bool + distance: int | None = Field( + None, + description="If margin is involved, return 0. If margin is uninvolved/free, try your best to find the distance at both microscopic and macroscopic(gross) description, and specify the distance from tumor to margin in mm, rounded to integer. 
If the margin is uninvolved/free and, after your best effort, the distance is still not specified, return null", + ) + description: str | None + + +class EsophagusLN(BaseModel): + lymph_node_category: ( + Literal[ + "regional_esophageal", + "regional_gastric", + "thoracic_1", + "thoracic_1r", + "thoracic_1l", + "thoracic_4", + "thoracic_4r", + "thoracic_4l", + "thoracic_7", + "thoracic_8u", + "thoracic_8m", + "thoracic_8l", + "thoracic_8", + "thoracic_9", + "thoracic_9r", + "thoracic_9l", + "thoracic_10", + "thoracic_10r", + "thoracic_10l", + "abdomen_106", + "abdomen_1", + "abdomen_2", + "abdomen_3", + "abdomen_4", + "abdomen_5", + "abdomen_6", + "abdomen_7", + "abdomen_8", + "abdomen_9", + "abdomen_10", + "others", + ] + | None + ) = Field( + None, + description="acceptable value for lymph node categories (i.e. stations or groups) in esophageal cancer. Default to thoracic if not mentioned. If not included in these standard lymph node 'station' number, should be classified as others.", + ) + involved: int + examined: int + station_name: str | None = Field( + None, description="specify the name of the lymph node station/group here." + ) + + +class EsophagusCancerNonnested(dspy.Signature): + """you need to extract the value of the specified items below from the given esophagus cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for esophagus cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + procedure: ( + Literal[ + "endoscopic_resection", "esophagectomy", "esophagogastrectomy", "others" + ] + | None + ) = dspy.OutputField( + desc="identify which surgery procedure was used. e.g. 
polypectomy" + ) + surgical_technique: ( + Literal["open", "thoracoscopic", "robotic", "hybrid", "endoscopic", "others"] + | None + ) = dspy.OutputField(desc="identify how the surgery was taken. e.g. thoracoscopic") + cancer_primary_site: ( + Literal[ + "upper_third", "middle_third", "lower_third", "gastroesophageal_junction" + ] + | None + ) = dspy.OutputField(desc="identify the primary site of cancer. e.g. upper_third") + histology: ( + Literal[ + "squamous_cell_carcinoma", + "adenocarcinoma", + "adenoid_cystic_carcinoma", + "mucoepidermoid_carcinoma", + "basaloid_squamous_cell_carcinoma", + "small_cell_carcinoma", + "large_cell_carcinoma", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify the histological type of the cancer. e.g. squamous cell carcinoma" + ) + grade: Literal[1, 2, 3] | None = dspy.OutputField( + desc="identify the grade of the cancer, well->1, moderate->2, poor->3" + ) + tumor_extent: ( + Literal[ + "mucosa", + "submucosa", + "muscularis_propria", + "adventitia", + "adjacent_structures", + ] + | None + ) = dspy.OutputField( + desc="identify how deep the tumor invades. e.g. muscularis_propria" + ) + lymphovascular_invasion: bool | None = dspy.OutputField( + desc="check whether or not lymphovascular invasion is present" + ) + perineural_invasion: bool | None = dspy.OutputField( + desc="check whether or not perineural invasion is present" + ) + distant_metastasis: bool | None = dspy.OutputField( + desc="check whether or not distant metastasis is present" + ) + treatment_effect: str | None = dspy.OutputField( + desc='check the treatment effect of the cancer. If you see "No known presurgical therapy", return None' + ) + + +class EsophagusCancerStaging(dspy.Signature): + """you need to extract the value of the specified items below from the given esophagus cancer excision report. DO NOT JUST RETURN NULL. 
IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for esophagus cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + tnm_descriptor: Literal["y", "r", "m"] | None = dspy.OutputField( + desc='identify the tnm descriptor of the tumor. e.g., "y" (post-therapy), "r", etc.' + ) + pt_category: Literal["tx", "t1a", "t1b", "t2", "t3", "t4a", "t4b"] | None = ( + dspy.OutputField(desc="identify the pt category of the tumor") + ) + pn_category: Literal["nx", "n0", "n1", "n2", "n3"] | None = dspy.OutputField( + desc="identify the pn category of the tumor" + ) + pm_category: Literal["mx", "m0", "m1"] | None = dspy.OutputField( + desc="identify the pm category of the tumor. if you see cM0 or cM1, etc., code as mx, since pathological M category is not available" + ) + stage_group: ( + Literal["0", "i", "ia", "ib", "ic", "iia", "iib", "iiia", "iiib", "iva", "ivb"] + | None + ) = dspy.OutputField(desc="identify the stage group of the tumor") + ajcc_version: int | None = dspy.OutputField( + desc="identify the ajcc version of the pathological staging" + ) + + +class EsophagusCancerMargins(dspy.Signature): + """you need to extract the value of the specified items below from the given esophagus cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for esophagus cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." 
+ ) + margins: list[EsophagusMargin] | None = dspy.OutputField( + desc="""return all of the possible involved margins and its distance from cancer. example:[{"margin_category": "proximal", "margin_involved": true, "distance": 5}, {"margin_category": "distal", "margin_involved": false, "distance": null}, {"margin_category": "radial", "margin_involved": false, "distance": null}]. If not present, just output null for every margin""" + ) + + +class EsophagusCancerLN(dspy.Signature): + """you need to extract the value of the specified items below from the given esophagus cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for esophagus cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + regional_lymph_node: list[EsophagusLN] | None = dspy.OutputField( + desc="""return all of the involved regional lymph node. example:[{"lymph_node_category": "thoracic_1", "involved": 2, "examined": 5}, ...]. 
If not present, just output null for every lymph node""" + ) + extranodal_extension: bool | None = dspy.OutputField( + desc="check whether or not extranodal extension is present; if no lymph node metastasis, should be None" + ) + maximal_ln_size: int | None = dspy.OutputField( + desc="check the maximal size of node metastatic tumor in mm, rounded to integer; if no lymph node metastasis, should be None" + ) diff --git a/models/liver.py b/models/liver.py new file mode 100644 index 0000000..d817618 --- /dev/null +++ b/models/liver.py @@ -0,0 +1,211 @@ +# -*- coding: utf-8 -*- +""" +models/liver.py +This script sets up a series of data extraction models using the dspy library for pathology reports, specifically focusing on hepatocellular carcinoma (cholangiocarcinoma not included). It includes model loading, signature definitions for various cancer types, and functions to convert model predictions into structured JSON formats. + +author: Hong-Kai (Walther) Chen, Po-Yen Tzeng and Kai-Po Chang @ Med NLP Lab, China Medical University +date: 2025-10-13 +""" + +__version__ = "1.0.0" +__date__ = "2025-10-13" +__author__ = ["Hong-Kai (Walther) Chen", "Po-Yen Tzeng", "Kai-Po Chang"] +__copyright__ = "Copyright 2025, Med NLP Lab, China Medical University" +__license__ = "MIT" +__ajcc_version__ = 8 +__cap_version__ = "4.3.0.0" + +import dspy +from typing import Literal +from pydantic import BaseModel, Field + + +class LiverMargin(BaseModel): + margin_category: ( + Literal["parenchymal", "hepatic_vein", "portal_vein", "bile_duct", "others"] + | None + ) = Field( + None, + description="acceptable value for surgical margins in liver cancer. If not included in these standard margins, should be classified as others.", + ) + margin_involved: bool + distance: int | None = Field( + None, + description="If margin is involved, return 0. 
If margin is uninvolved/free, try your best to find the distance at both microscopic and macroscopic(gross) description, and specify the distance from tumor to margin in mm, rounded to integer. If the margin is uninvolved/free and, after your best effort, the distance is still not specified, return null", + ) + description: str | None + + +class LiverLN(BaseModel): + involved: int + examined: int + station_name: str | None = Field( + None, description="specify the name of the lymph node station here." + ) + + +class LiverCancerNonnested(dspy.Signature): + """you need to extract the value of the specified items below from the given hepatocellular carcinoma excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for hepatocellular carcinoma excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + procedure: ( + Literal[ + "wedge_resection", + "partial_hepatectomy", + "segmentectomy", + "lobectomy", + "total_hepatectomy", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify which surgery procedure was used. e.g. partial hepatectomy" + ) + tumor_site: ( + Literal["right_lobe", "left_lobe", "caudate_lobe", "quadrate_lobe", "others"] + | None + ) = dspy.OutputField(desc="identify the site of the tumor. e.g. right lobe") + histology: ( + Literal[ + "hepatocellular_carcinoma", + "hepatocellular_carcinoma_fibrolamellar", + "hepatocellular_carcinoma_scirrhous", + "hepatocellular_carcinoma_clear_cell", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify the histological type of the cancer. e.g. 
invasive carcinoma of no special type" + ) + grade: Literal[1, 2, 3, 4] | None = dspy.OutputField( + desc="identify the grade of the cancer, well->1, moderate->2, poor->3, undifferentiated->4" + ) + tumor_size: int | None = dspy.OutputField( + desc="identify the size of the tumor in mm, rounded, if multiple tumors are present, please provide the size of the largest tumor" + ) + tumor_focality: Literal["unifocal", "multifocal"] | None = dspy.OutputField( + desc="identify whether the tumor is unifocal or multifocal" + ) + perineural_invasion: bool | None = dspy.OutputField( + desc="check whether or not perineural invasion is present" + ) + distant_metastasis: bool | None = dspy.OutputField( + desc="check whether or not distant metastasis is present" + ) + treatment_effect: str | None = dspy.OutputField( + desc='check the treatment effect of the cancer. If you see "No known presurgical therapy", return None' + ) + + +class LiverCancerExtent(dspy.Signature): + """ + You need to determine whether the tumor have extended out of liver. If so, extract list of tumor extension to one or more of the specified items below from the given hepatocellular carcinoma excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS. + """ + + report: list = dspy.InputField( + desc="this is a pathological report for hepatocellular carcinoma excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + tumor_extent: ( + list[ + Literal[ + "hepatic_vein", + "portal_vein", + "visceral_peritoneum", + "gallbladder", + "diaphragm", + "others", + ] + ] + | None + ) = dspy.OutputField( + desc='return all of the possible tumor extension. example: ["hepatic_vein", "gallbladder"]. 
If not present, just output None, do not overdiagnosis' + ) + + +class LiverCancerVascularInvasion(dspy.Signature): + """ + You need to determine whether the tumor have vascular invasion. If so, extract list of vascular invasion to one or more of the specified items below from the given hepatocellular carcinoma excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS. + """ + + report: list = dspy.InputField( + desc="this is a pathological report for hepatocellular carcinoma excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + vascular_invasion: ( + list[Literal["large_hepatic_vein", "large_portal_vein", "small_vessel"]] | None + ) = dspy.OutputField( + desc='return all of the possible vascular invasion. example: ["large_hepatic_vein", "small_vessel"]. If not present, just output None, do not overdiagnosis' + ) + + +class LiverCancerStaging(dspy.Signature): + """you need to extract the value of the specified items below from the given liver cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for hepatocellular carcinoma excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + tnm_descriptor: Literal["y", "r", "m"] | None = dspy.OutputField( + desc='identify the tnm descriptor of the tumor. e.g., "y" (post-therapy), "r", etc.' 
+ ) + pt_category: Literal["tx", "t1a", "t1b", "t2", "t3", "t4"] | None = ( + dspy.OutputField(desc="identify the pt category of the tumor") + ) + pn_category: Literal["nx", "n0", "n1"] | None = dspy.OutputField( + desc="identify the pn category of the tumor" + ) + pm_category: Literal["mx", "m0", "m1"] | None = dspy.OutputField( + desc="identify the pm category of the tumor. if you see cM0 or cM1, etc., code as mx, since pathological M category is not available" + ) + overall_stage: Literal["ia", "ib", "ii", "iiia", "iiib", "iva", "ivb"] | None = ( + dspy.OutputField(desc="identify the overall stage of the tumor") + ) + ajcc_version: int | None = dspy.OutputField( + desc="identify the ajcc version of the pathological staging" + ) + + +class LiverCancerMargins(dspy.Signature): + """you need to extract the value of the specified items below from the given hepatocellular carcinoma excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for hepatocellular carcinoma excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + margins: list[LiverMargin] | None = dspy.OutputField( + desc="""return all of the possible involved margins and its distance from cancer. example:[{"margin_category": "proximal_duodenal", "margin_involved": true, "distance": null}, {"margin_category": "anterior_outmost", "margin_involved": false, "distance": 10}, {"margin_category": "radial", "margin_involved": false, "distance": null}]. If not present, just output null for every margin""" + ) + + +class LiverCancerLN(dspy.Signature): + """you need to extract the value of the specified items below from the given hepatocellular carcinoma excision report. DO NOT JUST RETURN NULL. 
IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for hepatocellular carcinoma excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + regional_lymph_node: list[LiverLN] | None = dspy.OutputField( + desc="""return all of the involved regional lymph node. example:[{"involved": 2, "examined": 5, "station_name": "station 1"}, {"involved": 0, "examined": 3, "station_name": "station 2"}, ...]. If not present, just output null for every lymph node""" + ) + extranodal_extension: bool | None = dspy.OutputField( + desc="check whether or not extranodal extension is present; if no lymph node metastasis, should be None" + ) + maximal_ln_size: int | None = dspy.OutputField( + desc="check the maximal size of node metastatic tumor in mm, rounded to integer; if no lymph node metastasis, should be None" + ) diff --git a/models/lung.py b/models/lung.py new file mode 100644 index 0000000..609b0d9 --- /dev/null +++ b/models/lung.py @@ -0,0 +1,308 @@ +# -*- coding: utf-8 -*- +""" +models/lung.py +This script sets up a series of data extraction models using the dspy library for pathology reports, specifically focusing on lung cancer. It includes model loading, signature definitions for various cancer types, and functions to convert model predictions into structured JSON formats. 
+ +author: Hong-Kai (Walther) Chen, Po-Yen Tzeng and Kai-Po Chang @ Med NLP Lab, China Medical University +date: 2025-10-13 +""" + +__version__ = "1.0.0" +__date__ = "2025-10-13" +__author__ = ["Hong-Kai (Walther) Chen", "Po-Yen Tzeng", "Kai-Po Chang"] +__copyright__ = "Copyright 2025, Med NLP Lab, China Medical University" +__license__ = "MIT" +__ajcc_version__ = 8 +__cap_version__ = "4.3.0.1" + +import dspy +from typing import Literal +from pydantic import BaseModel, Field + + +class LungMargin(BaseModel): + margin_category: ( + Literal["bronchial", "vascular", "parenchymal", "chest_wall", "others"] | None + ) = Field( + None, + description="acceptable value for surgical margins in lung cancer. If not included in these standard margins, should be classified as others.", + ) + margin_involved: bool + distance: int | None = Field( + None, + description="If margin is involved, return 0. If margin is uninvolved/free, try your best to find the distance at both microscopic and macroscopic(gross) description, and specify the distance from tumor to margin in mm, rounded to integer. If the margin is uninvolved/free and, after your best effort, the distance is still not specified, return null", + ) + description: str | None + + +class LungLN(BaseModel): + lymph_node_side: Literal["right", "left"] | None = Field( + None, + description="acceptable value for lymph node side in lung cancer. If not included in these standard sides, should be classified as None.", + ) + lymph_node_category: ( + Literal[ + "peribronchial", + "1", + "2", + "4", + "5", + "6", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "3a", + "3p", + "7", + "others", + ] + | None + ) = Field( + None, + description="acceptable value for lymph node categories (i.e. stations or groups) in lung cancer. 
If not included in these standard lymph node 'stations'/'groups' should be classified as others.", + ) + involved: int + examined: int + station_name: str | None = Field( + None, description="specify the name of the lymph node station/group here." + ) + + +class LungHistologicalPattern(BaseModel): + pattern_name: ( + Literal["acinar", "lepidic", "papillary", "solid", "micropapillary", "others"] + | None + ) = Field( + None, description="histological pattern of invasive non-mucinous adenocarcinoma" + ) + pattern_percentage: int | None = Field( + None, description="percentage of the histological pattern" + ) + + +class LungBiomarker(BaseModel): + biomarker_category: Literal["ALK", "ROS1", "PDL1", "others"] | None = Field( + None, + description="acceptable value for biomarker categories in lung cancer. If not included in these standard categories, should be classified as others.", + ) + expression: bool + percentage: int | None = Field( + None, + description="the percentage of tumor cells showing positive expression of the biomarker, rounded to integer. if not specified, return None", + ) + biomarker_name: str | None = Field( + None, description="specify the name of the biomarker here." + ) + + +class LungCancerNonnested(dspy.Signature): + """you need to extracting the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for lung cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." 
+ ) + procedure: ( + Literal[ + "wedge_resection", + "segmentectomy", + "lobectomy", + "completion_lobectomy", + "sleeve_lobectomy", + "bilobectomy", + "pneumonectomy", + "major_airway_resection", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify which surgery procedure was used. e.g. lobectomy" + ) + surgical_technique: ( + Literal["open", "thoracoscopic", "robotic", "hybrid", "others"] | None + ) = dspy.OutputField(desc="identify how the surgery was taken. e.g. thoracoscopic") + sideness: Literal["right", "left", "midline"] | None = dspy.OutputField( + desc="identify which side the tumor is located. e.g. right" + ) + cancer_primary_site: ( + Literal[ + "upper_lobe", + "middle_lobe", + "lower_lobe", + "main_bronchus", + "bronchus_intermedius", + "bronchus_lobar", + "others", + ] + | None + ) = dspy.OutputField(desc="identify the primary site of cancer. e.g. upper_lobe") + tumor_focality: ( + Literal[ + "single_focus", + "separate_in_same_lobe_t3", + "separate_nodule_in_ipsilateral_t4", + "separate_nodule_in_contralateral_m1a", + ] + | None + ) = dspy.OutputField( + desc="identify whether the tumor is single focus or multiple foci; this is important for t and m category" + ) + histology: ( + Literal[ + "adenocarcinoma", + "squamous_cell_carcinoma", + "adenosquamous_carcinoma", + "large_cell_carcinoma", + "large_cell_neuroendocrine_carcinoma", + "small_cell_carcinoma", + "carcinoid_tumor", + "sarcomatoid_carcinoma", + "pleomorphic_carcinoma", + "pulmonary_lymphoepithelioma_like_carcinoma", + "mucoepidermoid_carcinoma", + "salivary_gland_type_tumor", + "non_small_cell_carcinoma_not_specified", + "non_small_cell_carcinoma_with_neuroendocrine_features", + "other", + ] + | None + ) = dspy.OutputField( + desc="identify the histological type of the cancer. e.g.
adenocarcinoma" + ) + grade: int | None = dspy.OutputField( + desc="identify the grade of the cancer, well->1, moderate->2, poor->3, undiff->4" + ) + lymphovascular_invasion: bool | None = dspy.OutputField( + desc="check whether or not lymphovascular invasion is present" + ) + perineural_invasion: bool | None = dspy.OutputField( + desc="check whether or not perineural invasion is present" + ) + spread_through_air_spaces_stas: bool | None = dspy.OutputField( + desc="check whether or not spread through air spaces (STAS) is present" + ) + visceral_pleural_invasion: bool | None = dspy.OutputField( + desc="check whether or not visceral pleural invasion is present" + ) + direct_invasion_of_adjacent_structures: bool | None = dspy.OutputField( + desc="check whether or not direct invasion of adjacent structures is present" + ) + distant_metastasis: bool | None = dspy.OutputField( + desc="check whether or not distant metastasis is present" + ) + treatment_effect: str | None = dspy.OutputField( + desc='check the treatment effect of the cancer. If you see "No known presurgical therapy", return None' + ) + + +class LungCancerStaging(dspy.Signature): + """you need to extract the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for lung cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + tnm_descriptor: Literal["y", "r", "m"] | None = dspy.OutputField( + desc='identify the tnm descriptor of the tumor. e.g., "y" (post-therapy), "r", etc.'
+ ) + pt_category: ( + Literal["tx", "tis", "t1mi", "t1a", "t1b", "t1c", "t2a", "t2b", "t3", "t4"] + | None + ) = dspy.OutputField(desc="identify the pt category of the tumor") + pn_category: Literal["nx", "n0", "n1", "n2", "n3"] | None = dspy.OutputField( + desc="identify the pn category of the tumor" + ) + pm_category: Literal["mx", "m0", "m1a", "m1b", "m1c"] | None = dspy.OutputField( + desc="identify the pm category of the tumor. if you see cM0 or cM1, etc., code as mx, since pathological M category is not available" + ) + stage_group: ( + Literal[ + "0", + "ia1", + "ia2", + "ia3", + "ib", + "iia", + "iib", + "iiia", + "iiib", + "iiic", + "iva", + "ivb", + "ivc", + ] + | None + ) = dspy.OutputField(desc="identify the stage group of the tumor") + ajcc_version: int | None = dspy.OutputField( + desc="identify the ajcc version of the pathological staging" + ) + + +class LungCancerMargins(dspy.Signature): + """you need to extract the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for lung cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + margins: list[LungMargin] | None = dspy.OutputField( + desc="""return all of the possible involved margins and its distance from cancer. example:[{"margin_category": "bronchial", "margin_involved": true, "distance": 0}, {"margin_category": "vascular", "margin_involved": false, "distance": null}, {"margin_category": "parenchymal", "margin_involved": false, "distance": null}].
If not present, just output null for every margin""" + ) + + +class LungCancerLN(dspy.Signature): + """you need to extract the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for lung cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + regional_lymph_node: list[LungLN] | None = dspy.OutputField( + desc="""return all of the involved regional lymph node. example:[{"lymph_node_side": "right", "lymph_node_category": "4", "involved": 2, "examined": 5}, {"lymph_node_side": "left", "lymph_node_category": "7", "involved": 0, "examined": 3}]. If not present, just output null for every lymph node""" + ) + extranodal_extension: bool | None = dspy.OutputField( + desc="check whether or not extranodal extension is present; if no lymph node metastasis, should be None" + ) + maximal_ln_size: int | None = dspy.OutputField( + desc="check the maximal size of node metastatic tumor in mm, rounded to integer; if no lymph node metastasis, should be None" + ) + + +class LungCancerBiomarkers(dspy.Signature): + """you need to extract the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for lung cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
+ ) + biomarkers: list[LungBiomarker] | None = dspy.OutputField( + desc="""return all of the examined biomarkers using immunohistochemistry techniques, like alk, ros1, pd-l1, in JSON format. example:[{"biomarker_category": "ALK", "expression": true, "percentage": 70}, {"biomarker_category": "PDL1", "expression": false, "percentage": 0}]. If not present, just output None for every biomarker""" + ) + + +class LungCancerOthernested(dspy.Signature): + """you need to extract the value of the specified items below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for lung cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + histological_patterns: list[LungHistologicalPattern] | None = dspy.OutputField( + desc="if the histology is invasive non-mucinous adenocarcinoma, please identify all of the histological patterns and its percentage.
if not, return None" + ) diff --git a/models/modellist.py b/models/modellist.py new file mode 100644 index 0000000..c1903c1 --- /dev/null +++ b/models/modellist.py @@ -0,0 +1,70 @@ +organmodels = { + "lung": [ + "LungCancerNonnested", + "LungCancerStaging", + "LungCancerMargins", + "LungCancerLN", + "LungCancerBiomarkers", + "LungCancerOthernested", + ], + "colorectal": [ + "ColonCancerNonnested", + "ColonCancerStaging", + "ColonCancerMargins", + "ColonCancerLN", + "ColonCancerBiomarkers", + ], + "prostate": [ + "ProstateCancerNonnested", + "ProstateCancerStaging", + "ProstateCancerMargins", + "ProstateCancerLN", + ], + "esophagus": [ + "EsophagusCancerNonnested", + "EsophagusCancerStaging", + "EsophagusCancerMargins", + "EsophagusCancerLN", + ], + "breast": [ + "BreastCancerNonnested", + "DCIS", + "BreastCancerGrading", + "BreastCancerStaging", + "BreastCancerMargins", + "BreastCancerLN", + "BreastCancerBiomarkers", + ], + "pancreas": [ + "PancreasCancerNonnested", + "PancreasCancerStaging", + "PancreasCancerMargins", + "PancreasCancerLN", + ], + "thyroid": [ + "ThyroidCancerNonnested", + "ThyroidCancerStaging", + "ThyroidCancerMargins", + "ThyroidCancerLN", + ], + "cervix": [ + "CervixCancerNonnested", + "CervixCancerStaging", + "CervixCancerMargins", + "CervixCancerLN", + ], + "liver": [ + "LiverCancerNonnested", + "LiverCancerExtent", + "LiverCancerVascularInvasion", + "LiverCancerStaging", + "LiverCancerMargins", + "LiverCancerLN", + ], + "stomach": [ + "StomachCancerNonnested", + "StomachCancerStaging", + "StomachCancerMargins", + "StomachCancerLN", + ], +} diff --git a/models/pancreas.py b/models/pancreas.py new file mode 100644 index 0000000..7f2dbe1 --- /dev/null +++ b/models/pancreas.py @@ -0,0 +1,215 @@ +# -*- coding: utf-8 -*- +""" +models/pancreas.py +This script sets up a series of data extraction models using the dspy library for pathology reports, specifically focusing on pancreas cancer. 
It includes model loading, signature definitions for various cancer types, and functions to convert model predictions into structured JSON formats. + +author: Hong-Kai (Walther) Chen, Po-Yen Tzeng and Kai-Po Chang @ Med NLP Lab, China Medical University +date: 2025-10-13 +""" + +__version__ = "1.0.0" +__date__ = "2025-10-13" +__author__ = ["Hong-Kai (Walther) Chen", "Po-Yen Tzeng", "Kai-Po Chang"] +__copyright__ = "Copyright 2025, Med NLP Lab, China Medical University" +__license__ = "MIT" +__ajcc_version__ = 8 +__cap_version__ = "4.2.0.2" + +import dspy +from typing import Literal +from pydantic import BaseModel, Field + + +class PancreasMargin(BaseModel): + margin_category: ( + Literal[ + "distal_pancreatic", + "proximal_pancreatic", + "pancreatic_neck", + "uncinate", + "bile_duct", + "proximal_gastric", + "proximal_duodenal", + "distal_intestinal", + "outmost", + "anterior_outmost", + "posterior_outmost", + "others", + ] + | None + ) = Field( + None, + description="acceptable value for surgical margins in pancreas cancer. If not included in these standard margins, should be classified as others.", + ) + margin_involved: bool + distance: int | None = Field( + None, + description="If margin is involved, return 0. If margin is uninvolved/free, try your best to find the distance at both microscopic and macroscopic(gross) description, and specify the distance from tumor to margin in mm, rounded to integer. If the margin is uninvolved/free and, after your best effort, the distance is still not specified, return null", + ) + description: str | None + + +class PancreasLN(BaseModel): + lymph_node_category: ( + Literal[ + "regional_pancreatic", + "regional_gastric", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "others", + ] + | None + ) = Field( + None, + description="acceptable value for lymph node categories in pancreas cancer. 
If not included in these standard lymph node 'station' number, should be classified as others.", + ) + involved: int + examined: int + station_name: str | None = Field( + None, description="specify the name of the lymph node station/group here." + ) + + +class PancreasCancerNonnested(dspy.Signature): + """you need to extract the value of the specified items below from the given pancreas cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for pancreas cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + procedure: ( + Literal[ + "partial_pancreatectomy", + "ssppd", + "pppd", + "whipple_procedure", + "distal_pancreatectomy", + "total_pancreatectomy", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify which surgery procedure was used. e.g. partial pancreatectomy, pylorus-preserving pancreaticoduodenectomy(PPPD), Subtotal stomach-preserving pancreaticoduodenectomy (SSPPD), Whipple procedure, etc." + ) + tumor_site: ( + Literal["head", "neck", "body", "tail", "uncinate_process", "others"] | None + ) = dspy.OutputField( + desc="identify the primary site of cancer. e.g. head of pancreas" + ) + histology: ( + Literal[ + "ductal_adenocarcinoma_nos", + "ipmn_with_carcinoma", + "itpn_with_carcinoma", + "acinar_cell_carcinoma", + "solid_pseudopapillary_neoplasm", + "undifferentiated_carcinoma", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify the histological type of the cancer. e.g. 
ductal adenocarcinoma" + ) + tumor_size: int | None = dspy.OutputField( + desc="identify the size of the tumor in mm, rounded, if multiple tumors are present, please provide the size of the largest tumor" + ) + tumor_extension: ( + Literal[ + "within_pancreas", + "peripancreatic_soft_tissue", + "adjacent_organs_structures", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify the extent of tumor extension. e.g. within pancreas" + ) + lymphovascular_invasion: bool | None = dspy.OutputField( + desc="check whether or not lymphovascular invasion is present" + ) + perineural_invasion: bool | None = dspy.OutputField( + desc="check whether or not perineural invasion is present" + ) + distant_metastasis: bool | None = dspy.OutputField( + desc="check whether or not distant metastasis is present" + ) + treatment_effect: str | None = dspy.OutputField( + desc='check the treatment effect of the cancer. If you see "No known presurgical therapy", return None' + ) + + +class PancreasCancerStaging(dspy.Signature): + """you need to extract the value of the specified items below from the given pancreas cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for pancreas cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + tnm_descriptor: Literal["y", "r", "m"] | None = dspy.OutputField( + desc='identify the tnm descriptor of the tumor. e.g., "y" (post-therapy), "r", etc.'
+ ) + pt_category: Literal["tx", "tis", "t1a", "t1b", "t1c", "t2", "t3", "t4"] | None = ( + dspy.OutputField(desc="identify the pt category of the tumor") + ) + pn_category: Literal["nx", "n0", "n1", "n2"] | None = dspy.OutputField( + desc="identify the pn category of the tumor" + ) + pm_category: Literal["mx", "m1"] | None = dspy.OutputField( + desc="identify the pm category of the tumor. if you see cM0 or cM1, etc., code as mx, since pathological M category is not available" + ) + overall_stage: Literal["ia", "ib", "iia", "iib", "iii", "iv"] | None = ( + dspy.OutputField(desc="identify the overall stage of the tumor") + ) + ajcc_version: int | None = dspy.OutputField( + desc="identify the ajcc version of the pathological staging" + ) + + +class PancreasCancerMargins(dspy.Signature): + """you need to extract the value of the specified items below from the given pancreas cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for pancreas cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + margins: list[PancreasMargin] | None = dspy.OutputField( + desc="""return all of the possible involved margins and its distance from cancer. example:[{"margin_category": "proximal_duodenal", "margin_involved": true, "distance": 0}, {"margin_category": "anterior_outmost", "margin_involved": false, "distance": 10}, {"margin_category": "uncinate", "margin_involved": false, "distance": null}]. If not present, just output null for every margin""" + ) + + +class PancreasCancerLN(dspy.Signature): + """you need to extract the value of the specified items below from the given pancreas cancer excision report. DO NOT JUST RETURN NULL.
IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for pancreas cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + regional_lymph_node: list[PancreasLN] | None = dspy.OutputField( + desc="""return all of the involved regional lymph node. example:[{"lymph_node_category": "1", "involved": 2, "examined": 5, "station_name": "station 1"}, {"lymph_node_category": "2", "involved": 0, "examined": 3, "station_name": "station 2"}, ...]. If not present, just output null for every lymph node""" + ) + extranodal_extension: bool | None = dspy.OutputField( + desc="check whether or not extranodal extension is present" + ) + maximal_ln_size: int | None = dspy.OutputField( + desc="check the maximal size of node metastatic tumor in mm, rounded to integer. if not, return None" + ) diff --git a/models/prostate.py b/models/prostate.py new file mode 100644 index 0000000..5f0bcb6 --- /dev/null +++ b/models/prostate.py @@ -0,0 +1,223 @@ +# -*- coding: utf-8 -*- +""" +models/prostate.py +This script sets up a series of data extraction models using the dspy library for pathology reports, specifically focusing on prostate cancer. It includes model loading, signature definitions for various cancer types, and functions to convert model predictions into structured JSON formats. 
+ +author: Hong-Kai (Walther) Chen, Po-Yen Tzeng and Kai-Po Chang @ Med NLP Lab, China Medical University +date: 2025-10-13 +""" + +__version__ = "1.0.0" +__date__ = "2025-10-13" +__author__ = ["Hong-Kai (Walther) Chen", "Po-Yen Tzeng", "Kai-Po Chang"] +__copyright__ = "Copyright 2025, Med NLP Lab, China Medical University" +__license__ = "MIT" +__ajcc_version__ = 8 +__cap_version__ = "4.3.0.0" + +import dspy +from typing import Literal +from pydantic import BaseModel, Field + + +class ProstateLN(BaseModel): + lymph_node_side: Literal["right", "left", "midline"] | None = Field( + None, + description="acceptable value for lymph node side in prostate cancer. If not included in these standard sides, should be classified as None.", + ) + lymph_node_category: ( + Literal[ + "hypogastric", + "obturator", + "external_iliac", + "internal_iliac", + "common_iliac", + "iliac_nos", + "pelvic_nos", + "others", + ] + | None + ) = Field( + None, + description="acceptable value for lymph node categories (i.e. stations) in prostate cancer. If not included in these standard lymph node 'station' number, should be classified as others.", + ) + involved: int + examined: int + station_name: str | None = Field( + None, description="specify the name of the lymph node group/station here." + ) + + +class ProstateCancerNonnested(dspy.Signature): + """you need to extract the value of the specified items below from the given prostate cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for prostate cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
+ ) + procedure: Literal["radical_prostatectomy", "others"] | None = dspy.OutputField( + desc="identify which surgery procedure was used. e.g. radical prostatectomy" + ) + surgical_technique: Literal["open", "robotic", "hybrid", "others"] | None = ( + dspy.OutputField(desc="identify how the surgery was taken. e.g. robotic") + ) + prostate_weight: int | None = dspy.OutputField( + desc="identify the weight of the prostate in grams. e.g. 50" + ) + prostate_size: int | None = dspy.OutputField( + desc="identify the size of the prostate in mm, largest dimension. e.g. 45 means 45mm" + ) + histology: ( + Literal[ + "acinar_adenocarcinoma", + "intraductal_carcinoma", + "ductal_adenocarcinoma", + "mixed_acinar_ductal", + "neuroendocrine_carcinoma_small_cell", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify the histological type of the cancer. e.g. acinar adenocarcinoma" + ) + grade: ( + Literal[ + "group_1_3_3", + "group_2_3_4", + "group_3_4_3", + "group_4_4_4", + "group_5_4_5", + "group_5_5_4", + "group_5_5_5", + ] + | None + ) = dspy.OutputField( + desc="identify the gleason group of the cancer. e.g. group_2_3_4 means gleason score 7 (3+4)" + ) + gleason_4_percentage: int | None = dspy.OutputField( + desc="identify the percentage of gleason pattern 4. e.g. 20 means 20%" + ) + gleason_5_percentage: int | None = dspy.OutputField( + desc="identify the percentage of gleason pattern 5. e.g. 0 means 0%" + ) + intraductal_carcinoma_presence: bool | None = dspy.OutputField( + desc="check whether or not intraductal carcinoma is present" + ) + cribriform_pattern_presence: bool | None = dspy.OutputField( + desc="check whether or not cribriform pattern is present. only in gleason score 7 or 8" + ) + tumor_percentage: int | None = dspy.OutputField( + desc="identify the percentage of tumor in the prostate in both lobes. e.g. 30 means 30%" + ) + tumor_size: int | None = dspy.OutputField( + desc="identify the size of the tumor in mm, largest dimension. e.g. 
15 means 15mm" + ) + extraprostatic_extension: bool | None = dspy.OutputField( + desc="check whether or not extraprostatic extension is present" + ) + seminal_vesicle_invasion: bool | None = dspy.OutputField( + desc="check whether or not seminal vesicle invasion is present" + ) + bladder_invasion: bool | None = dspy.OutputField( + desc="check whether or not bladder invasion is present" + ) + lymphovascular_invasion: bool | None = dspy.OutputField( + desc="check whether or not lymphovascular invasion is present" + ) + perineural_invasion: bool | None = dspy.OutputField( + desc="check whether or not perineural invasion is present" + ) + distant_metastasis: bool | None = dspy.OutputField( + desc="check whether or not distant metastasis is present" + ) + treatment_effect: str | None = dspy.OutputField( + desc='check the treatment effect of the cancer. If you see "No known presurgical therapy", return None' + ) + + +class ProstateCancerStaging(dspy.Signature): + """you need to extract the value of the specified items below from the given prostate cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for prostate cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + tnm_descriptor: Literal["y", "r", "m"] | None = dspy.OutputField( + desc='identify the tnm descriptor of the tumor. e.g., "y" (post-therapy), "r", etc.'
+ ) + pt_category: Literal["tx", "t2", "t3a", "t3b", "t4"] | None = dspy.OutputField( + desc="identify the pt category of the tumor" + ) + pn_category: Literal["nx", "n0", "n1"] | None = dspy.OutputField( + desc="identify the pn category of the tumor" + ) + pm_category: Literal["mx", "m0", "m1a", "m1b", "m1c"] | None = dspy.OutputField( + desc="identify the pm category of the tumor. if you see cM0 or cM1, etc., code as mx, since pathological M category is not available" + ) + stage_group: ( + Literal["0", "i", "iia", "iib", "iic", "iiia", "iiib", "iiic", "iva", "ivb"] + | None + ) = dspy.OutputField(desc="identify the stage group of the tumor") + ajcc_version: int | None = dspy.OutputField( + desc="identify the ajcc version of the pathological staging" + ) + + +class ProstateCancerMargins(dspy.Signature): + """you need to extract the value of the specified items below from the given prostate cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for prostate cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
+ ) + margin_positivity: bool | None = dspy.OutputField( + desc="check whether or not any margin is positive" + ) + involved_margin_list: ( + list[ + Literal[ + "right_apical", + "left_apical", + "right_bladder_neck", + "left_bladder_neck", + "right_anterior", + "left_anterior", + "right_lateral", + "left_lateral", + "right_posterolateral", + "left_posterolateral", + "right_posterior", + "left_posterior", + ] + ] + | None + ) = dspy.OutputField(desc="list of involved margins") + margin_length: Literal["limited", "non_limited"] | None = dspy.OutputField( + desc="if margin is positive, check whether the length of positive margin is limited (<3mm) or non-limited (>=3mm)" + ) + + +class ProstateCancerLN(dspy.Signature): + """you need to extract the value of the specified items below from the given prostate cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for prostate cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + regional_lymph_node: list[ProstateLN] | None = dspy.OutputField( + desc="""return all of the involved regional lymph node. example:[{"lymph_node_side": "right", "lymph_node_category": "internal_iliac", "involved": 2, "examined": 5}, ...].
If not present, just output null for every lymph node""" + ) + extranodal_extension: bool | None = dspy.OutputField( + desc="check whether or not extranodal extension is present; if no lymph node metastasis, should be None" + ) + maximal_ln_size: int | None = dspy.OutputField( + desc="check the maximal size of node metastatic tumor in mm, rounded to integer; if no lymph node metastasis, should be None" + ) diff --git a/models/stomach.py b/models/stomach.py new file mode 100644 index 0000000..a47b9c2 --- /dev/null +++ b/models/stomach.py @@ -0,0 +1,217 @@ +# -*- coding: utf-8 -*- +""" +models/stomach.py +This script sets up a series of data extraction models using the dspy library for pathology reports, specifically focusing on stomach cancer. It includes model loading, signature definitions for various cancer types, and functions to convert model predictions into structured JSON formats. + +author: Hong-Kai (Walther) Chen, Po-Yen Tzeng and Kai-Po Chang @ Med NLP Lab, China Medical University +date: 2025-10-13 +""" + +__version__ = "1.0.0" +__date__ = "2025-10-13" +__author__ = ["Hong-Kai (Walther) Chen", "Po-Yen Tzeng", "Kai-Po Chang"] +__copyright__ = "Copyright 2025, Med NLP Lab, China Medical University" +__license__ = "MIT" +__ajcc_version__ = 8 +__cap_version__ = "4.2.0.1" + +import dspy +from typing import Literal +from pydantic import BaseModel, Field + + +class StomachMargin(BaseModel): + margin_category: ( + Literal["proximal", "distal", "radial", "lateral", "deep", "others"] | None + ) = Field( + None, + description="acceptable value for surgical margins in stomach cancer. If not included in these standard margins, should be classified as others.", + ) + margin_involved: bool + distance: int | None = Field( + None, + description="If margin is involved, return 0. If margin is uninvolved/free, try your best to find the distance at both microscopic and macroscopic(gross) description, and specify the distance from tumor to margin in mm, rounded to integer. 
If the margin is uninvolved/free and, after your best effort, the distance is still not specified, return null", + ) + description: str | None + + +class StomachLN(BaseModel): + lymph_node_category: ( + Literal[ + "regional", + "regional_lesser_curv", + "regional_greater_curv", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "others", + ] + | None + ) = Field( + None, + description="acceptable value for lymph node groups in stomach cancer. If not included in these standard lymph node groups, should be classified as others.", + ) + involved: int + examined: int + station_name: str | None = Field( + None, description="specify the name of the lymph node group here." + ) + + +class StomachCancerNonnested(dspy.Signature): + """you need to extract the value of the specified items below from the given stomach cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for stomach cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + procedure: ( + Literal[ + "endoscopic_resection", "partial_gastrectomy", "total_gastrectomy", "others" + ] + | None + ) = dspy.OutputField( + desc="identify which surgery procedure was used. e.g. polypectomy" + ) + surgical_technique: ( + Literal["open", "laparoscopic", "robotic", "hybrid", "others"] | None + ) = dspy.OutputField(desc="identify how the surgery was taken. e.g. laparoscopic") + cancer_primary_site: ( + Literal["cardia", "fundus", "body", "antrum", "pylorus", "others"] | None + ) = dspy.OutputField(desc="identify the primary site of cancer. e.g. 
antrum") + histology: ( + Literal[ + "tubular_adenocarcinoma", + "poorly_cohesive_carcinoma", + "mixed_tubular_poorly_cohesive", + "mucinous_adenocarcinoma", + "mixed_mucinous_poorly_cohesive", + "hepatoid_carcinoma", + "others", + ] + | None + ) = dspy.OutputField( + desc="identify the histological type of the cancer. e.g. tubular adenocarcinoma" + ) + grade: Literal[1, 2, 3] | None = dspy.OutputField( + desc="identify the grade of the cancer, well->1, moderate->2, poor->3" + ) + tumor_extent: ( + Literal[ + "lamina_propria", + "muscularis_mucosae", + "submucosa", + "muscularis_propria", + "penetrate_subserosal_connective_tissue_no_serosa", + "invades_serosa_without_adjacent_structure_invasion", + "invades_adjacent_structures", + ] + | None + ) = dspy.OutputField( + desc="identify how deep the tumor invades. e.g. penetrate_subserosal_connective_tissue_no_serosa for a t3 tumor" + ) + extracellular_mucin: bool | None = dspy.OutputField( + desc="check whether or not extracellular mucin is present" + ) + signet_ring: bool | None = dspy.OutputField( + desc="check whether or not signet ring cell is present" + ) + lymphovascular_invasion: bool | None = dspy.OutputField( + desc="check whether or not lymphovascular invasion is present" + ) + perineural_invasion: bool | None = dspy.OutputField( + desc="check whether or not perineural invasion is present" + ) + distant_metastasis: bool | None = dspy.OutputField( + desc="check whether or not distant metastasis is present" + ) + treatment_effect: str | None = dspy.OutputField( + desc='check the treatment effect of the cancer. If you see "No known presurgical therapy", return None' + ) + + +class StomachCancerStaging(dspy.Signature): + """you need to extract the value of the specified items below from the given stomach cancer excision report. DO NOT JUST RETURN NULL. 
IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for stomach cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + tnm_descriptor: Literal["y", "r", "m"] | None = dspy.OutputField( + desc='identify the tnm descriptor of the tumor. e.g., "y" (post-therapy), "r", etc.' + ) + pt_category: Literal["tx", "t1a", "t1b", "t2", "t3", "t4a", "t4b"] | None = ( + dspy.OutputField(desc="identify the pt category of the tumor") + ) + pn_category: Literal["nx", "n0", "n1", "n2", "n3a", "n3b"] | None = ( + dspy.OutputField(desc="identify the pn category of the tumor") + ) + pm_category: Literal["mx", "m0", "m1"] | None = dspy.OutputField( + desc="identify the pm category of the tumor. if you see cM0 or cM1, etc., code as mx, since pathological M category is not available" + ) + stage_group: ( + Literal[ + "0", + "i", + "ii", + "iii", + "iv", + "ia", + "ib", + "iia", + "iib", + "iiia", + "iiib", + "iiic", + ] + | None + ) = dspy.OutputField(desc="identify the stage group of the tumor") + ajcc_version: int | None = dspy.OutputField( + desc="identify the ajcc version of the pathological staging" + ) + + +class StomachCancerMargins(dspy.Signature): + """you need to extract the value of the specified items below from the given stomach cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for stomach cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." 
+ ) + margins: list[StomachMargin] | None = dspy.OutputField( + desc="""return all of the possible involved margins and its distance from cancer. example:[{"margin_category": "proximal", "margin_involved": true, "distance": 5}, {"margin_category": "distal", "margin_involved": false, "distance": null}, {"margin_category": "radial", "margin_involved": false, "distance": null}]. If not present, just output null for every margin""" + ) + + +class StomachCancerLN(dspy.Signature): + """you need to extract the value of the specified items below from the given stomach cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for stomach cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + regional_lymph_node: list[StomachLN] | None = dspy.OutputField( + desc="""return all of the involved regional lymph node. example:[{"lymph_node_category": "1", "involved": 2, "examined": 5, "station_name": "station 1"}, {"lymph_node_category": "2", "involved": 0, "examined": 3, "station_name": "station 2"}, ...]. 
If not present, just output null for every lymph node""" + ) + extranodal_extension: bool | None = dspy.OutputField( + desc="check whether or not extranodal extension is present; if no lymph node metastasis, should be None" + ) + maximal_ln_size: int | None = dspy.OutputField( + desc="check the maximal size of node metastatic tumor in mm, rounded to integer; if no lymph node metastasis, should be None" + ) diff --git a/models/thyroid.py b/models/thyroid.py new file mode 100644 index 0000000..de9252a --- /dev/null +++ b/models/thyroid.py @@ -0,0 +1,211 @@ +# -*- coding: utf-8 -*- +""" +models/thyroid.py +This script sets up a series of data extraction models using the dspy library for pathology reports, specifically focusing on thyroid cancer. It includes model loading, signature definitions for various cancer types, and functions to convert model predictions into structured JSON formats. + +author: Hong-Kai (Walther) Chen, Po-Yen Tzeng and Kai-Po Chang @ Med NLP Lab, China Medical University +date: 2025-10-13" +""" + +__version__ = "1.0.0" +__date__ = "2025-10-13" +__author__ = ["Hong-Kai (Walther) Chen", "Po-Yen Tzeng", "Kai-Po Chang"] +__copyright__ = "Copyright 2025, Med NLP Lab, China Medical University" +__license__ = "MIT" +__ajcc_version__ = 8 +__cap_version__ = "4.4.0.0" + +import dspy +from typing import Literal +from pydantic import BaseModel, Field + + +class ThyroidMargin(BaseModel): + margin_category: ( + Literal["outmost", "anterior_outmost", "posterior_outmost", "isthmus", "others"] + | None + ) = Field( + None, + description="acceptable value for surgical margins in thyroid cancer. If not included in these standard margins, should be classified as others.", + ) + margin_involved: bool + distance: int | None = Field( + None, + description="If margin is involved, return 0. 
class ThyroidLN(BaseModel):
    # Schema for one lymph-node entry in a thyroid cancer report; used as the
    # element type of a `list[ThyroidLN]` output field.

    # Laterality of the node group; None when the report gives none of these.
    lymph_node_side: Literal["right", "left", "midline"] | None = Field(
        None,
        # Fixed: this description previously said "cervical cancer".
        description="acceptable value for lymph node side in thyroid cancer. If not included in these standard sides, should be classified as None.",
    )
    # Neck level of the node group; anything outside the list maps to "others".
    lymph_node_category: (
        Literal[
            "level_vi_central",
            "level_i",
            "level_ii",
            "level_iii",
            "level_iv",
            "level_v",
            "level_vii",
            "others",
        ]
        | None
    ) = Field(
        None,
        description="acceptable value for lymph node categories in thyroid cancer. If not included in these standard lymph node 'station' number, should be classified as others.",
    )
    # Both counts are required (no default).
    involved: int
    examined: int
    # Optional free-text station name as written in the report.
    station_name: str | None = Field(
        None, description="specify the name of the lymph node station here."
    )


class ThyroidCancerNonnested(dspy.Signature):
    """you need to extract the value of the specified items below from the given thyroid cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # The docstring above is the instruction text of this signature, so it is
    # runtime text rather than documentation. It previously said "pancreas
    # cancer" (copy-paste from another organ model); corrected to "thyroid".
    #
    # Inputs: the report split into paragraphs, plus a rough JSON summary
    # produced by an earlier step. Outputs: flat (non-nested) thyroid items.

    report: list = dspy.InputField(
        desc="this is a pathological report for thyroid cancer excision, separated into paragraphs."
    )
    report_jsonized: dict = dspy.InputField(
        desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
    )
    # Fixed: the old desc offered "Hashimoto thyroiditis" as an example, which
    # the Literal cannot accept. The desc now names only the allowed values.
    predisposing_condition: Literal["radiation", "family_history"] | None = (
        dspy.OutputField(
            desc='identify any predisposing condition mentioned in the report: "radiation" (prior radiation exposure) or "family_history". If none of these is mentioned, return None'
        )
    )
    procedure: (
        Literal[
            "partial_excision",
            "right_lobectomy",
            "left_lobectomy",
            "total_thyroidectomy",
            "others",
        ]
        | None
    ) = dspy.OutputField(
        desc="identify which surgery procedure was used. e.g. partial thyroidectomy"
    )
    tumor_focality: Literal["unifocal", "multifocal", "not_specified"] | None = (
        dspy.OutputField(
            desc="identify whether the tumor is unifocal or multifocal. If not specified, return not_specified"
        )
    )
    tumor_site: (
        Literal["right_lobe", "left_lobe", "isthmus", "both_lobe", "others"] | None
    ) = dspy.OutputField(
        desc="identify the primary site of cancer. e.g. right lobe of thyroid"
    )
    histology: (
        Literal[
            "papillary_thyroid_carcinoma",
            "follicular_thyroid_carcinoma",
            "medullary_thyroid_carcinoma",
            "anaplastic_thyroid_carcinoma",
            "others",
        ]
        | None
    ) = dspy.OutputField(
        desc="identify the histological type of the cancer. e.g. papillary thyroid carcinoma"
    )
    tumor_size: int | None = dspy.OutputField(
        desc="identify the size of the tumor in mm, rounded, if multiple tumors are present, please provide the size of the largest tumor"
    )
    mitotic_activity: Literal["less_than_3", "3_to_5", "more_than_5"] | None = (
        dspy.OutputField(
            desc="identify the mitotic activity of the tumor, in mitoses per 10 high power fields. if not specified, return null"
        )
    )
    extrathyroid_extension: (
        Literal[
            "microscopic_strap_muscle",
            "macroscopic_strap_muscle_t3b",
            "subcutaneous_trachea_esophagus_rln_t4a",
            "prevertebral_carotid_mediastinal_t4b",
        ]
        | None
    ) = dspy.OutputField(
        desc="identify if clinically extrathyroid extension is present, only four categories are allowed, if none specified, return None"
    )
    tumor_necrosis: bool | None = dspy.OutputField(
        desc="check whether or not tumor necrosis is present"
    )
    lymphovascular_invasion: bool | None = dspy.OutputField(
        desc="check whether or not lymphovascular invasion is present"
    )
    perineural_invasion: bool | None = dspy.OutputField(
        desc="check whether or not perineural invasion is present"
    )
    distant_metastasis: bool | None = dspy.OutputField(
        desc="check whether or not distant metastasis is present"
    )
    treatment_effect: str | None = dspy.OutputField(
        desc='check the treatment effect of the cancer. If you see "No known presurgical therapy", return None'
    )
class ThyroidCancerMargins(dspy.Signature):
    """you need to extract the value of the specified items below from the given thyroid cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # NOTE(review): DSPy is expected to use the docstring above and every
    # `desc` below as prompt text — treat them as runtime strings, not docs.
    #
    # Inputs: the report split into paragraphs and a rough JSON summary of it.
    # Output: a list of ThyroidMargin entries, or None.

    report: list = dspy.InputField(
        desc="this is a pathological report for thyroid cancer excision, separated into paragraphs."
    )
    report_jsonized: dict = dspy.InputField(
        desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
    )
    # The desc embeds a worked JSON example using the ThyroidMargin keys
    # (margin_category, margin_involved, distance).
    margins: list[ThyroidMargin] | None = dspy.OutputField(
        desc="""return all of the possible involved margins and its distance from cancer. example:[ {"margin_category": "anterior_outmost", "margin_involved": false, "distance": 10}, {"margin_category": "isthmus", "margin_involved": false, "distance": null}]. If not present, just output null for every margin"""
    )
IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.""" + + report: list = dspy.InputField( + desc="this is a pathological report for thyroid cancer excision, separated into paragraphs." + ) + report_jsonized: dict = dspy.InputField( + desc="this is a roughly structured json summary of the pathological report, which is generated by another model." + ) + regional_lymph_node: list[ThyroidLN] | None = dspy.OutputField( + desc="""return all of the involved regional lymph node. example:[{"lymph_node_side": "left", "lymph_node_category": "level_vi_central", "involved": 2, "examined": 5, "station_name": "station 6"}, {"lymph_node_side": "right", "lymph_node_category": "level_ii", "involved": 0, "examined": 3, "station_name": "station 6"}, ...]. If not present, just output null for every lymph node""" + ) + extranodal_extension: bool | None = dspy.OutputField( + desc="check whether or not extranodal extension is present; if no lymph node metastasis, should be None" + ) + maximal_ln_size: int | None = dspy.OutputField( + desc="check the maximal size of node metastatic tumor in mm, rounded to integer; if no lymph node metastasis, should be None" + ) diff --git a/pipeline.py b/pipeline.py new file mode 100644 index 0000000..64cee52 --- /dev/null +++ b/pipeline.py @@ -0,0 +1,197 @@ +# -*- coding: utf-8 -*- +""" +pipeline.py +This script sets up a pipeline for processing pathology reports using large language models (LLMs). It includes functions for loading models, configuring the dspy library, and defining signatures for various cancer types. The pipeline is designed to extract structured information from pathology reports and convert it into JSON format. 
def timeit(func):
    """
    Decorator that measures a function's wall-clock execution time.

    The decorated function no longer returns its bare result: it returns a
    ``(result, elapsed_seconds)`` tuple, where ``elapsed_seconds`` is a float
    measured with ``time.perf_counter``.

    :param func: The callable to wrap.
    :return: A wrapper with the same call signature that returns
        ``(result, elapsed_seconds)``.
    """
    # Local import so the block does not depend on the file-level import list.
    from functools import wraps

    # wraps() keeps __name__/__doc__ of the wrapped function, so logs and
    # introspection show the real function instead of "wrapper".
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        return result, time.perf_counter() - start_time

    return wrapper
class CancerPipelineConfig(PrecompiledConfig):
    # Default LM settings for the pipeline. `localaddr` is not defined in this
    # module; it comes in through one of the star imports at the top of the file.
    model: str = "ollama_chat/qwen3:30b"
    api_base: str = localaddr
    api_key: str = ""
    model_type: str = "chat"
    top_p: float = 0.7
    max_tokens: int = 16384
    num_ctx: int = 16384
    temperature: float = 0.7
    seed: int = 10


class CancerPipeline(PrecompiledAgent):
    """Three-stage extraction agent for pathology reports.

    Stage 1 classifies the report with the ``is_cancer`` signature, stage 2
    builds a rough JSON summary with ``ReportJsonize``, and stage 3 runs every
    organ-specific signature listed in ``organmodels`` for the detected
    category and merges their outputs into one dictionary.
    """

    config: CancerPipelineConfig

    def __init__(self, config: CancerPipelineConfig, **kwargs):
        super().__init__(config, **kwargs)
        self.analyzer_is_cancer = dspy.Predict(is_cancer)
        self.jsonize = dspy.Predict(ReportJsonize)

    def forward(self, report: str, logger: logging.Logger, fname: str = "") -> dict:
        """
        Classify a report and, for cancer excision reports, extract organ data.

        Args:
            report (str): The pathology report text. It is split into
                paragraphs on blank lines before being sent to any predictor.
            logger (logging.Logger): Logger that receives progress and errors.
            fname (str): Optional file name, used only in log messages.

        Returns:
            dict: Always has ``cancer_excision_report``, ``cancer_category``
            and ``cancer_data``. Cancer reports also carry
            ``cancer_category_others_description``. ``cancer_data`` stays
            empty for category "others" or when no organ model succeeds.
        """
        print(f"Processing report: {fname}")
        logger.info(f"Processing report: {fname}")
        paragraphs = [p.strip() for p in report.split("\n\n") if p.strip()]
        context_response = self.analyzer_is_cancer(report=paragraphs)

        if not context_response.cancer_excision_report:
            logger.info("This is NOT a cancer excision report.")
            output_report = {
                "cancer_excision_report": False,
                "cancer_category": None,
                "cancer_data": {},
            }
            print(json.dumps(output_report, indent=2, ensure_ascii=False))
            return output_report

        output_report = {
            "cancer_excision_report": True,
            "cancer_category": context_response.cancer_category,
            "cancer_category_others_description": context_response.cancer_category_others_description,
            "cancer_data": {},
        }
        logger.info("This is a cancer excision report.")
        if context_response.cancer_category == "others":
            logger.info(
                f"Cancer category is {context_response.cancer_category_others_description}, Currently not implemented."
            )
        elif context_response.cancer_category:
            logger.info(f"Cancer category is {context_response.cancer_category}.")
            try:
                json_response = self.jsonize(
                    report=paragraphs, cancer_category=context_response.cancer_category
                )
                json_report = json_response.output
            except Exception as e:
                # Best effort: the organ models can still run without the
                # summary, but the failure must be visible in the log.
                logger.warning(
                    f"Report jsonization failed, continuing with an empty summary: {e}"
                )
                json_report = {}

            for items in organmodels.get(context_response.cancer_category, []):
                # Signature classes are looked up by name among the names the
                # star imports placed into this module's globals.
                cls = globals().get(items)
                if cls is None:
                    logger.error(f"Model class {items} not found.")
                    continue
                logger.info(
                    f"Processing organ-specific model: {cls.__name__} at {time.strftime('%Y-%m-%d %H:%M:%S')} for {context_response.cancer_category} cancer for {fname}"
                )
                organ_analyzer = dspy.Predict(cls)
                try:
                    # The organ signatures declare `report: list` (paragraphs),
                    # so pass the same paragraph list the other predictors get
                    # rather than the raw report string.
                    organ_response = organ_analyzer(
                        report=paragraphs, report_jsonized=json_report
                    )
                    organ_data = dump_prediction_plain(organ_response)
                    output_report["cancer_data"].update(organ_data)
                except Exception as e:
                    # One failing organ model must not discard the others.
                    logger.error(f"Error processing {cls.__name__}: {e}")
                    continue

        return output_report


@timeit
def run_pipeline(experiment_model: dspy.Module, **kwargs):
    """
    Call a pipeline instance and time the call.

    Args:
        experiment_model (dspy.Module): An instantiated pipeline. It is called
            with ``**kwargs`` plus ``logger=logging.getLogger("experiment_logger")``.
        **kwargs: Forwarded to the model, e.g. ``report=...`` and ``fname=...``.

    Returns:
        Because of ``@timeit`` the caller receives ``(response, elapsed_seconds)``.
    """
    response = experiment_model(**kwargs, logger=logging.getLogger("experiment_logger"))
    return response


def run_cancer_pipeline(report: str, fname: str = "") -> Tuple[dict, float]:
    """
    Run the cancer pipeline on the provided report.

    :param report: The pathology report to analyze
    :param fname: Optional filename for logging purposes
    :return: Extracted structured data as a dictionary and the elapsed time in seconds
    """
    # Fixed: the class itself used to be passed here, so the call inside
    # run_pipeline invoked the constructor with report/fname/logger and raised
    # TypeError for the missing `config`. Build an instance with the defaults.
    cancer_pipeline = CancerPipeline(CancerPipelineConfig())
    response, timing = run_pipeline(cancer_pipeline, report=report, fname=fname)
    return response, timing


if __name__ == "__main__":
    # model_name = "qwen30b"  # Example model name
    # setup_pipeline(model_name)
    print("Pipeline is ready for processing pathology reports.")
    cancer_pipeline = CancerPipeline(CancerPipelineConfig())
    cancer_pipeline.push_to_hub("kblab/cancer-pipeline", with_code=True)
    # Batch usage sketch (was a dead string literal with invalid "\w"-style
    # escape sequences; kept as a comment):
    #   for file in Path(data_dir).absolute().glob("*.txt"):
    #       rep = file.read_text(encoding="utf-8")
    #       response, timing = run_pipeline(cancer_pipeline, report=rep)
    #       print(response)
# Standard-library imports used by the helpers below.
import json
from typing import Any, Iterable, Callable, Optional
from dataclasses import is_dataclass, asdict
from datetime import datetime, date
from decimal import Decimal
from collections.abc import Mapping


# --- helpers to make everything JSON-safe ---
def _to_json_safe(x: Any):
    """Convert a leaf value that ``json.dumps`` cannot encode.

    datetime/date become ISO strings, Decimal becomes float, and numpy
    scalars/arrays become Python numbers/lists when numpy can be imported.
    Any other value is returned unchanged.
    """
    if isinstance(x, (datetime, date)):
        return x.isoformat()
    if isinstance(x, Decimal):
        return float(x)
    try:
        import numpy as np  # optional
    except Exception:
        # numpy is missing or unusable: nothing more to convert.
        return x
    if isinstance(x, np.integer):
        return int(x)
    if isinstance(x, np.floating):
        return float(x)
    if isinstance(x, np.ndarray):
        return x.tolist()
    return x


# --- core recursive dumper ---


def dump_prediction_plain(pred) -> dict[str, Any]:
    """Recursively convert a DSPy Prediction into a plain dict.

    Keys starting with "_" are dropped from mappings; lists, tuples and sets
    become lists; objects exposing ``model_dump()``, ``dict()`` or
    ``to_dict()`` are converted through the first of those that succeeds.
    A non-dict top-level result is wrapped as ``{"value": result}``.
    """
    try:
        from dspy.primitives.prediction import Prediction
    except Exception:
        # Same tolerance as dump_prediction: plain structures still work.
        Prediction = None

    def to_plain(obj: Any):
        # Base primitives (return as-is)
        if obj is None or isinstance(obj, (str, int, float, bool)):
            return obj

        # Nested Prediction → dict
        if Prediction is not None and isinstance(obj, Prediction):
            return {
                k: to_plain(v)
                for k, v in obj._store.items()
                if not k.startswith(("_lm_usage", "_inputs", "_completions"))
            }

        # Mappings (dict-like)
        if isinstance(obj, Mapping):
            return {
                k: to_plain(v)
                for k, v in obj.items()
                if not (isinstance(k, str) and k.startswith("_"))
            }

        # Lists / tuples / sets
        if isinstance(obj, (list, tuple, set)):
            return [to_plain(v) for v in obj]

        # Objects with a dict-export method. model_dump is tried first so
        # pydantic v2 models avoid their deprecated .dict().
        for attr in ("model_dump", "dict", "to_dict"):
            method = getattr(obj, attr, None)
            if callable(method):
                try:
                    return to_plain(method())
                except Exception:
                    pass

        # Fallback: plain value
        return obj

    result = to_plain(pred)

    # If the top level isn't a dict, wrap it so we always return one
    return result if isinstance(result, dict) else {"value": result}


def dump_prediction(
    obj: Any,
    *,
    exclude_private: bool = True,
    exclude_keys: tuple[str, ...] = ("_lm_usage", "_inputs", "_completions"),
    custom_predicate: Optional[Callable[[str, Any], bool]] = None,
) -> Any:
    """
    Recursively convert a DSPy Prediction (or arbitrary nested structure)
    into JSON-serializable Python types, excluding selected internal fields.

    - exclude_private=True removes any dict keys starting with "_".
    - exclude_keys removes specific keys regardless of exclude_private.
    - custom_predicate(key, value) -> bool can veto inclusion of a field.

    Cyclic structures are not detected and will recurse until Python's
    recursion limit is hit.
    """
    # Primitives are already JSON-safe.
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj

    def recurse(value: Any) -> Any:
        return dump_prediction(
            value,
            exclude_private=exclude_private,
            exclude_keys=exclude_keys,
            custom_predicate=custom_predicate,
        )

    def keep(key: Any, value: Any) -> bool:
        # Name-based filters apply only to string keys.
        if isinstance(key, str):
            if key in exclude_keys:
                return False
            if exclude_private and key.startswith("_"):
                return False
        return not custom_predicate or bool(custom_predicate(key, value))

    def dump_items(items) -> dict:
        return {k: recurse(v) for k, v in items if keep(k, v)}

    # DSPy Prediction: data lives in the internal `_store` mapping.
    try:
        from dspy.primitives.prediction import Prediction  # type: ignore
    except Exception:
        Prediction = None
    if Prediction is not None and isinstance(obj, Prediction):
        return dump_items((getattr(obj, "_store", {}) or {}).items())

    # Pydantic (v2 or v1) objects
    try:
        from pydantic import BaseModel  # type: ignore
    except Exception:
        BaseModel = None
    if BaseModel is not None and isinstance(obj, BaseModel):
        try:
            data = obj.model_dump() if hasattr(obj, "model_dump") else obj.dict()
        except Exception:
            # Best effort: fall through to the generic branches below.
            pass
        else:
            return recurse(data)

    # Dataclass instances. is_dataclass() is also true for dataclass
    # classes, which asdict() rejects, so classes are excluded here.
    if is_dataclass(obj) and not isinstance(obj, type):
        return recurse(asdict(obj))

    # Any mapping, not only dict
    if isinstance(obj, Mapping):
        return dump_items(obj.items())

    # Sequence
    if isinstance(obj, (list, tuple, set)):
        return [recurse(v) for v in obj]

    # Objects with to_dict()/dict(): only the export call itself is guarded,
    # so errors raised while dumping the exported data are not hidden.
    for meth in ("to_dict", "dict"):
        method = getattr(obj, meth, None)
        if callable(method):
            try:
                data = method()
            except Exception:
                continue
            return recurse(data)

    # Fallback to JSON-safe conversion (datetime, numpy, decimal, etc.)
    return _to_json_safe(obj)


# --- combining many predictions into one JSON blob ---
def dump_many_predictions(
    preds: Iterable[Any],
    *,
    key_fn: Optional[Callable[[Any, int], Optional[str]]] = None,
    **dump_kwargs,
) -> str:
    """
    Dump many predictions into a single JSON string.

    - By default returns a JSON array string.
    - If key_fn is provided and returns a non-None key, we build a dict
      mapping key -> dumped prediction.
    - dump_kwargs are forwarded to dump_prediction (e.g., exclude_keys=...).
    """
    # Materialize once: `preds` is walked twice when key_fn is given, and a
    # generator would already be exhausted on the second pass (empty output).
    preds = list(preds)
    dumped_list = [dump_prediction(p, **dump_kwargs) for p in preds]

    if key_fn is not None:
        mapping = {}
        for i, (p, d) in enumerate(zip(preds, dumped_list)):
            k = key_fn(p, i)
            if k is not None:
                mapping[k] = d
        return json.dumps(mapping, ensure_ascii=False, indent=2)

    return json.dumps(dumped_list, ensure_ascii=False, indent=2)