(no commit message)

This commit is contained in:
2025-11-30 16:46:59 -05:00
parent 5abf88bf85
commit a91e871481
19 changed files with 3064 additions and 0 deletions

305
models/breast.py Normal file
View File

@@ -0,0 +1,305 @@
# -*- coding: utf-8 -*-
"""
models/breast.py
This script sets up a series of data extraction models using the dspy library for pathology reports, specifically focusing on breast cancer. It includes model loading, signature definitions for various cancer types, and functions to convert model predictions into structured JSON formats.
author: Hong-Kai (Walther) Chen, Po-Yen Tzeng and Kai-Po Chang @ Med NLP Lab, China Medical University
date: 2025-10-13
"""
# Module metadata: release version/date, authorship and licensing.
__version__ = "1.0.0"
__date__ = "2025-10-13"
__author__ = ["Hong-Kai (Walther) Chen", "Po-Yen Tzeng", "Kai-Po Chang"]
__copyright__ = "Copyright 2025, Med NLP Lab, China Medical University"
__license__ = "MIT"
# NOTE(review): presumably the AJCC staging manual edition and the CAP protocol
# version these extraction schemas were written against -- confirm with authors.
__ajcc_version__ = 8
__cap_version__ = "4.10.0.0"
import dspy
from typing import Literal
from pydantic import BaseModel, Field
class BreastMargin(BaseModel):
    # One surgical margin of a breast specimen together with its status.
    #
    # Documented with comments instead of a class docstring on purpose:
    # pydantic copies a model's docstring into its generated JSON schema, and
    # that schema may be shown to the language model, so a docstring would
    # change more than documentation.

    # Where the margin lies: clock-face sectors ("12_3_clock" ...), single
    # clock positions ("12_clock" ...), "superficial" or "base".
    # "others" is accepted because the field description instructs the
    # extractor to use it for any margin outside the standard list; without it
    # that instructed value would be rejected by validation.
    margin_category: (
        Literal[
            "12_3_clock",
            "3_6_clock",
            "6_9_clock",
            "9_12_clock",
            "12_clock",
            "3_clock",
            "6_clock",
            "9_clock",
            "superficial",
            "base",
            "others",
        ]
        | None
    ) = Field(
        None,
        description="acceptable value for surgical margins in breast cancer. If not included in these standard margins, should be classified as others.",
    )
    # Whether this margin is involved. No default, so it must always be given.
    margin_involved: bool
    # Tumor-to-margin distance in mm (0 when involved, None when unknown); the
    # exact rules are spelled out in the description sent to the extractor.
    distance: int | None = Field(
        None,
        description="If margin is involved, return 0. If margin is uninvolved/free, try your best to find the distance at both microscopic and macroscopic(gross) description, and specify the distance from tumor to margin in mm, rounded to integer. If the margin is uninvolved/free and, after your best effort, the distance is still not specified, return null",
    )
    # Free-text note about the margin. Defaults to None like the other nullable
    # fields, so a record that omits the key still validates (a bare
    # `str | None` annotation makes the key mandatory in pydantic v2).
    description: str | None = None
# Closed vocabularies used by BreastLN; None is additionally allowed per field.
_LymphNodeSide = Literal["right", "left", "midline"]
_LymphNodeCategory = Literal["sentinel", "nonsentinel", "others"]


class BreastLN(BaseModel):
    # One group of regional lymph nodes: side, category, counts and name.
    # Comments are used instead of a docstring so the generated JSON schema
    # of the model is left exactly as it was.

    # Body side of the node group; None when the side is not a standard one.
    lymph_node_side: _LymphNodeSide | None = Field(
        default=None,
        description="acceptable value for lymph node side in breast cancer. If not included in these standard sides, should be classified as None.",
    )
    # Sentinel / non-sentinel / others classification of the node group.
    lymph_node_category: _LymphNodeCategory | None = Field(
        default=None,
        description="acceptable value for lymph node categories (i.e. stations) in breast cancer. If not included in these standard lymph node 'station' number, should be classified as others.",
    )
    # Node counts; both are required integers (no default).
    involved: int
    examined: int
    # Name of the station as free text.
    station_name: str | None = Field(
        default=None,
        description="specify the name of the lymph node station here.",
    )
# Closed vocabularies used by BreastBiomarker; None is additionally allowed.
_BiomarkerCategory = Literal["er", "pr", "her2", "ki67", "others"]
_Her2Score = Literal[0, 1, 2, 3]


class BreastBiomarker(BaseModel):
    # One biomarker result: category, expression flag, percentage, score, name.
    # Comments are used instead of a docstring so the generated JSON schema
    # of the model is left exactly as it was.

    # Which biomarker this entry is about.
    biomarker_category: _BiomarkerCategory | None = Field(
        default=None,
        description="acceptable value for biomarker categories in breast cancer. If not included in these standard categories, should be classified as others.",
    )
    # Expressed / not expressed; per the description, left empty for Her-2.
    expression: bool | None = Field(
        default=None,
        description="specify whether or not the biomarker is expressed here.For Her-2 please refer to the score field, and don't fill in this field.",
    )
    # Share of positive tumor cells as an integer percentage.
    percentage: int | None = Field(
        default=None,
        description="the percentage of tumor cells showing positive expression of the biomarker, rounded to integer. if not specified, return null",
    )
    # Her-2 score on the 0-3 scale, when applicable.
    score: _Her2Score | None = Field(
        default=None,
        description="specify the Her-2 expression score, negative: score 0 or 1, equivocal: score 2, positive: score 3 of the biomarker here, if applicable.",
    )
    # Biomarker name as free text.
    biomarker_name: str | None = Field(
        default=None,
        description="specify the name of the biomarker here.",
    )
# Closed vocabularies for the categorical outputs of BreastCancerNonnested.
_Procedure = Literal[
    "partial_mastectomy",
    "simple_mastectomy",
    "breast_conserving_surgery",
    "modified_radical_mastectomy",
    "total_mastectomy",
    "wide_excision",
    "others",
]
_Quadrant = Literal[
    "upper_outer_quadrant",
    "upper_inner_quadrant",
    "lower_outer_quadrant",
    "lower_inner_quadrant",
    "nipple",
    "others",
]
_ClockHour = Literal[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
_Laterality = Literal["right", "left", "bilateral"]
_Histology = Literal[
    "invasive_carcinoma_no_special_type",
    "invasive_lobular_carcinoma",
    "mixed_ductal_and_lobular_carcinoma",
    "tubular_adenocarcinoma",
    "mucinous_adenocarcinoma",
    "encapsulated_papillary_carcinoma",
    "solid_papillary_carcinoma",
    "inflammatory_carcinoma",
    "other_special_types",
]


class BreastCancerNonnested(dspy.Signature):
    """you need to extract the value of the specified items below from the given breast cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # The docstring above is kept verbatim: dspy uses a signature's docstring
    # as the task instructions, so it is prompt text, not documentation.
    # This signature collects the flat (non-list) findings of the report.

    # ---- inputs -----------------------------------------------------------
    report: list = dspy.InputField(
        desc="this is a pathological report for breast cancer excision, separated into paragraphs."
    )
    report_jsonized: dict = dspy.InputField(
        desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
    )

    # ---- outputs (declaration order is preserved) -------------------------
    procedure: _Procedure | None = dspy.OutputField(
        desc="identify which surgery procedure was used. e.g. partial mastectomy"
    )
    cancer_quadrant: _Quadrant | None = dspy.OutputField(
        desc="identify the primary site of cancer. e.g. upper outer quadrant. please consider what side is the breast excision specimen when you determine the quadrant"
    )
    cancer_clock: _ClockHour | None = dspy.OutputField(
        desc="identify the clock position of the cancer if mentioned. e.g. 3"
    )
    cancer_laterality: _Laterality | None = dspy.OutputField(
        desc="identify the side of the cancer. e.g. right"
    )
    # NOTE(review): the desc below refers to a "description field", but this
    # signature declares no such output -- confirm what was intended.
    histology: _Histology | None = dspy.OutputField(
        desc="identify the histological type of the cancer. e.g. invasive carcinoma of no special type. If the histological type is not included in the above list, please code as other_special_types and specify the histological type in the description field."
    )
    tumor_size: int | None = dspy.OutputField(
        desc="identify the size of the tumor in mm, rounded, if multiple tumors are present, please provide the size of the largest tumor"
    )
    lymphovascular_invasion: bool | None = dspy.OutputField(
        desc="check whether or not lymphovascular invasion is present"
    )
    perineural_invasion: bool | None = dspy.OutputField(
        desc="check whether or not perineural invasion is present"
    )
    distant_metastasis: bool | None = dspy.OutputField(
        desc="check whether or not distant metastasis is present"
    )
    treatment_effect: str | None = dspy.OutputField(
        desc='check the treatment effect of the cancer. If you see "No known presurgical therapy", return None'
    )
# DCIS grade scale: 1 = low, 2 = intermediate, 3 = high (see the field desc).
_DcisGrade = Literal[1, 2, 3]


class DCIS(dspy.Signature):
    """
    You need to extract data about ductal carcinoma in situ below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS.
    """

    # The docstring above is kept verbatim: dspy uses a signature's docstring
    # as the task instructions, so it is prompt text, not documentation.

    # ---- inputs -----------------------------------------------------------
    report: list = dspy.InputField(
        desc="this is a pathological report for breast cancer excision, separated into paragraphs."
    )
    report_jsonized: dict = dspy.InputField(
        desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
    )

    # ---- outputs: presence flag first, then details of the DCIS -----------
    dcis_present: bool | None = dspy.OutputField(
        desc="check whether or not ductal carcinoma in situ is present"
    )
    dcis_size: int | None = dspy.OutputField(
        desc="if ductal carcinoma in situ is present, identify the size of the largest DCIS in mm, rounded"
    )
    dcis_comedo_necrosis: bool | None = dspy.OutputField(
        desc="if ductal carcinoma in situ is present, check whether or not comedo necrosis is present"
    )
    dcis_grade: _DcisGrade | None = dspy.OutputField(
        desc="if ductal carcinoma in situ is present, identify the grade of DCIS, low grade (grade 1), intermediate grade (grade 2), high grade (grade 3)"
    )
# Closed vocabularies for the staging outputs of BreastCancerStaging.
_TnmDescriptor = Literal["y", "r", "m"]
# NOTE(review): this list has no "t0" or "t4d" entry -- confirm that is intended.
_PtCategory = Literal[
    "tx", "tis", "t1mi", "t1a", "t1b", "t1c", "t2", "t3", "t4a", "t4b", "t4c"
]
_PnCategory = Literal[
    "nx", "n0", "n1mi", "n1a", "n1b", "n1c", "n2a", "n2b", "n3a", "n3b", "n3c"
]
_PmCategory = Literal["mx", "m0", "m1"]
# The pathologic and anatomic stage groups accept the same set of values.
_StageGroup = Literal["0", "ia", "ib", "iia", "iib", "iiia", "iiib", "iiic", "iv"]


class BreastCancerStaging(dspy.Signature):
    """you need to extract the value of the specified items below from the given breast cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # The docstring above is kept verbatim: dspy uses a signature's docstring
    # as the task instructions, so it is prompt text, not documentation.

    # ---- inputs -----------------------------------------------------------
    report: list = dspy.InputField(
        desc="this is a pathological report for breast cancer excision, separated into paragraphs."
    )
    report_jsonized: dict = dspy.InputField(
        desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
    )

    # ---- outputs: TNM components, stage groups, AJCC edition --------------
    tnm_descriptor: _TnmDescriptor | None = dspy.OutputField(
        desc='identify the tnm descriptor of the tumor. e.g., "y" (post-therapy), "r", etc.'
    )
    pt_category: _PtCategory | None = dspy.OutputField(
        desc="identify the pt category of the tumor"
    )
    pn_category: _PnCategory | None = dspy.OutputField(
        desc="identify the pn category of the tumor"
    )
    pm_category: _PmCategory | None = dspy.OutputField(
        desc="identify the pm category of the tumor. if you see cM0 or cM1, etc., code as mx, since pathological M category is not available"
    )
    pathologic_stage_group: _StageGroup | None = dspy.OutputField(
        desc="identify the pathologic stage group of the tumor, return none if only anatomical stage group is given, dont guess"
    )
    anatomic_stage_group: _StageGroup | None = dspy.OutputField(
        desc="identify the anatomic stage group of the tumor"
    )
    ajcc_version: int | None = dspy.OutputField(
        desc="identify the ajcc version of the pathological staging"
    )
# Value ranges for BreastCancerGrading: each component and the final grade are
# 1-3, and the total score is restricted to 3-9.
_GradeComponent = Literal[1, 2, 3]
_GradeTotal = Literal[3, 4, 5, 6, 7, 8, 9]


class BreastCancerGrading(dspy.Signature):
    """you need to extract the value of the specified items below from the given breast cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # The docstring above is kept verbatim: dspy uses a signature's docstring
    # as the task instructions, so it is prompt text, not documentation.

    # ---- inputs -----------------------------------------------------------
    report: list = dspy.InputField(
        desc="this is a pathological report for breast cancer excision, separated into paragraphs."
    )
    report_jsonized: dict = dspy.InputField(
        desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
    )

    # ---- outputs: three component scores, their total, the overall grade --
    nuclear_grade: _GradeComponent | None = dspy.OutputField(
        desc="identify the nuclear grade of the tumor"
    )
    tubule_formation: _GradeComponent | None = dspy.OutputField(
        desc="identify the tubule formation score of the tumor"
    )
    mitotic_rate: _GradeComponent | None = dspy.OutputField(
        desc="identify the mitotic rate score of the tumor"
    )
    total_score: _GradeTotal | None = dspy.OutputField(
        desc="identify the total score of the tumor"
    )
    grade: _GradeComponent | None = dspy.OutputField(
        desc="identify the grade of the tumor"
    )
class BreastCancerMargins(dspy.Signature):
    """you need to extract the value of the specified items below from the given breast cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # The docstring above is kept verbatim: dspy uses a signature's docstring
    # as the task instructions, so it is prompt text, not documentation.
    # This signature extracts the list of surgical margins as BreastMargin
    # records.

    # ---- inputs -----------------------------------------------------------
    report: list = dspy.InputField(
        desc="this is a pathological report for breast cancer excision, separated into paragraphs."
    )
    report_jsonized: dict = dspy.InputField(
        desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
    )

    # ---- output -----------------------------------------------------------
    # The example inside the desc must itself be a valid list of BreastMargin
    # records, because the extractor tends to imitate it:
    #   * margin_category values come from BreastMargin's allowed set (the
    #     previous "proximal"/"distal"/"radial" are not breast margins and
    #     would fail validation);
    #   * an involved margin carries distance 0, as BreastMargin.distance
    #     instructs;
    #   * every record includes the "description" key.
    margins: list[BreastMargin] | None = dspy.OutputField(
        desc="""return all of the possible involved margins and its distance from cancer. example:[{"margin_category": "superficial", "margin_involved": true, "distance": 0, "description": "tumor present at the superficial margin"}, {"margin_category": "base", "margin_involved": false, "distance": 5, "description": null}, {"margin_category": "12_3_clock", "margin_involved": false, "distance": null, "description": null}]. If not present, just output null for every margin"""
    )
# Output type of BreastCancerLN.regional_lymph_node: a list of BreastLN
# records, or None.
_RegionalNodes = list[BreastLN] | None


class BreastCancerLN(dspy.Signature):
    """you need to extract the value of the specified items below from the given breast cancer excision report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""

    # The docstring above is kept verbatim: dspy uses a signature's docstring
    # as the task instructions, so it is prompt text, not documentation.

    # ---- inputs -----------------------------------------------------------
    report: list = dspy.InputField(
        desc="this is a pathological report for breast cancer excision, separated into paragraphs."
    )
    report_jsonized: dict = dspy.InputField(
        desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
    )

    # ---- outputs: per-group node records, then two summary findings -------
    regional_lymph_node: _RegionalNodes = dspy.OutputField(
        desc="""return all of the involved regional lymph node. example:[{"lymph_node_side": "right", "lymph_node_category": "sentinel", "involved": 2, "examined": 5, "station_name": "station 1"}, {"lymph_node_side": "left", "lymph_node_category": "nonsentinel", "involved": 0, "examined": 3, "station_name": "station 2"}, ...]. If not present, just output null for every lymph node"""
    )
    extranodal_extension: bool | None = dspy.OutputField(
        desc="check whether or not extranodal extension is present; if no lymph node metastasis, should be None"
    )
    maximal_ln_size: int | None = dspy.OutputField(
        desc="check the maximal size of node metastatic tumor in mm, rounded to integer; if no lymph node metastasis, should be None"
    )
class BreastCancerBiomarkers(dspy.Signature):
    """you need to extract breast cancer biomarkers, which is very important, below from the given pathological cancer report. DO NOT JUST RETURN NULL. IF SOME ITEM IS NOT PRESENT, RETURN NULL FOR THAT ITEM, BUT TRY YOUR BEST TO FILL IN THE OTHERS."""
    # The docstring above is left untouched: dspy uses a signature's docstring
    # as the task instructions, so it is prompt text, not documentation.
    # Inputs: the report as a list of paragraphs plus a rough JSON summary of it.
    report: list = dspy.InputField(
        desc="this is a pathological report for breast cancer excision, separated into paragraphs."
    )
    report_jsonized: dict = dspy.InputField(
        desc="this is a roughly structured json summary of the pathological report, which is generated by another model."
    )
    # Output: one BreastBiomarker record per reported biomarker, or None.
    # NOTE(review): the desc is a multi-line literal, so its line breaks (and any
    # leading whitespace on the continuation lines) are part of the prompt text;
    # do not re-indent those lines without checking the resulting string.
    biomarkers: list[BreastBiomarker] | None = dspy.OutputField(
        desc="""return all of the examined immunoreceptors using immunohistochemistry techniques,like er, pr, her2, ki67, etc. in a list of pydantic structures,
example: [{"biomarker_category": "er", "expression": true, "percentage": 90, "score": null, "biomarker_name": "estrogen receptor"},
{"biomarker_category": "pr", "expression": false, "percentage": 0, "score": null, "biomarker_name": "progesterone receptor"},
{"biomarker_category": "her2", "expression": null, "percentage": 0, "score": 2, "biomarker_name": "human epidermal growth factor receptor 2"},
{"biomarker_category": "ki67", "expression": null, "percentage": 30, "score": null, "biomarker_name": "ki67 proliferation index"}]. If not present, just output null for every biomarker"""
    )