From b7f546052ad4ec283cb4fd8a40e22a9f672b1fc8 Mon Sep 17 00:00:00 2001 From: Tyrin Todd Date: Sun, 18 Jan 2026 12:19:47 -0800 Subject: [PATCH] (no commit message) --- README.md | 2 -- config.json | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++ program.json | 46 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 config.json create mode 100644 program.json diff --git a/README.md b/README.md index 29385ea..e69de29 100644 --- a/README.md +++ b/README.md @@ -1,2 +0,0 @@ -# ppe-judge-gepa - diff --git a/config.json b/config.json new file mode 100644 index 0000000..0f12334 --- /dev/null +++ b/config.json @@ -0,0 +1,59 @@ +{ + "model": null, + "signature": { + "description": " \"Task: Evaluate and compare the quality of two responses (Response A and Response B) given a specific question (input). Determine which response better addresses the question by focusing on factual correctness, completeness, and adherence to any specific requirements mentioned in the question prompt. Select the response that provides the most accurate, comprehensive, and relevant solution or explanation to the problem presented. Document your decision in the format \"A>B\" if Response A is the better choice, or \"B>A\" if Response B is superior.\n\nDetailed Instructions:\n\n1. **Understand the Question Context:**\n - Ensure you comprehend the full context and requirements specified by the question or problem statement. This includes understanding any specific instructions such as formats, calculations, algorithms, or methodologies that are mentioned.\n - Note any domain-specific terminologies or conditions, such as units of measurement or specific constants that need to be included in calculations or explanations.\n\n2. **Evaluate Each Response:**\n - Check for factual accuracy in the content, calculations, or recommendations provided in each response.\n - Assess the response for completeness\u2014whether it completely addresses all aspects of the question.\n - Verify the adherence of each response to the specified question requirements, such as avoiding certain methods (e.g., no saving as CSV) or changing visualization formats.\n - Consider clarity and structure of the explanation or solution provided.\n\n3. **Factual and Domain-Specific Considerations:**\n - When evaluating technical or scientific information (e.g., coding, mathematical problems, chemistry), use established domain knowledge to verify calculations, logic, or proposed solutions.\n - Take note of domain-specific constants or equations omitted in the response that might be critical.\n\n4. **Decision Making:**\n - Determine which response (A or B) best meets the above criteria.\n - Use a systematic approach to compare responses, focusing on differences that impact the quality of the solution or explanation.\n - Select the response that is not only correct but also most aligns with the question\u2019s specific requirements and limitations.\n\n5. **Output Your Conclusion:**\n - Once you have determined which response is better, output your decision in the required format: \"A>B\" if Response A is better, or \"B>A\" if Response B is better.\"", + "properties": { + "question": { + "__dspy_field_type": "input", + "desc": "The original question or prompt", + "prefix": "Question:", + "title": "Question", + "type": "string" + }, + "response_A": { + "__dspy_field_type": "input", + "desc": "First response to evaluate", + "prefix": "Response A:", + "title": "Response A", + "type": "string" + }, + "response_B": { + "__dspy_field_type": "input", + "desc": "Second response to evaluate", + "prefix": "Response B:", + "title": "Response B", + "type": "string" + }, + "label": { + "__dspy_field_type": "output", + "desc": "Which response is better: 'A>B' or 'B>A'", + "enum": [ + "A>B", + "B>A" + ], + "prefix": "Label:", + "title": "Label", + "type": "string" + } + }, + "required": [ + "question", + "response_A", + "response_B", + "label" + ], + "title": "JudgeSignature", + "type": "object" + }, + "lm": { + "model": "huggingface/together/Qwen/Qwen3-VL-32B-Instruct", + "model_type": "chat", + "cache": true, + "num_retries": 3, + "finetuning_model": null, + "launch_kwargs": {}, + "train_kwargs": {}, + "temperature": null, + "max_tokens": null + } +} \ No newline at end of file diff --git a/program.json b/program.json new file mode 100644 index 0000000..24226b5 --- /dev/null +++ b/program.json @@ -0,0 +1,46 @@ +{ + "predictor": { + "traces": [], + "train": [], + "demos": [], + "signature": { + "instructions": " \"Task: Evaluate and compare the quality of two responses (Response A and Response B) given a specific question (input). Determine which response better addresses the question by focusing on factual correctness, completeness, and adherence to any specific requirements mentioned in the question prompt. Select the response that provides the most accurate, comprehensive, and relevant solution or explanation to the problem presented. Document your decision in the format \"A>B\" if Response A is the better choice, or \"B>A\" if Response B is superior.\n\nDetailed Instructions:\n\n1. **Understand the Question Context:**\n - Ensure you comprehend the full context and requirements specified by the question or problem statement. This includes understanding any specific instructions such as formats, calculations, algorithms, or methodologies that are mentioned.\n - Note any domain-specific terminologies or conditions, such as units of measurement or specific constants that need to be included in calculations or explanations.\n\n2. **Evaluate Each Response:**\n - Check for factual accuracy in the content, calculations, or recommendations provided in each response.\n - Assess the response for completeness\u2014whether it completely addresses all aspects of the question.\n - Verify the adherence of each response to the specified question requirements, such as avoiding certain methods (e.g., no saving as CSV) or changing visualization formats.\n - Consider clarity and structure of the explanation or solution provided.\n\n3. **Factual and Domain-Specific Considerations:**\n - When evaluating technical or scientific information (e.g., coding, mathematical problems, chemistry), use established domain knowledge to verify calculations, logic, or proposed solutions.\n - Take note of domain-specific constants or equations omitted in the response that might be critical.\n\n4. **Decision Making:**\n - Determine which response (A or B) best meets the above criteria.\n - Use a systematic approach to compare responses, focusing on differences that impact the quality of the solution or explanation.\n - Select the response that is not only correct but also most aligns with the question\u2019s specific requirements and limitations.\n\n5. **Output Your Conclusion:**\n - Once you have determined which response is better, output your decision in the required format: \"A>B\" if Response A is better, or \"B>A\" if Response B is better.\"", + "fields": [ + { + "prefix": "Question:", + "description": "The original question or prompt" + }, + { + "prefix": "Response A:", + "description": "First response to evaluate" + }, + { + "prefix": "Response B:", + "description": "Second response to evaluate" + }, + { + "prefix": "Label:", + "description": "Which response is better: 'A>B' or 'B>A'" + } + ] + }, + "lm": { + "model": "huggingface/together/Qwen/Qwen3-VL-32B-Instruct", + "model_type": "chat", + "cache": true, + "num_retries": 3, + "finetuning_model": null, + "launch_kwargs": {}, + "train_kwargs": {}, + "temperature": null, + "max_tokens": null + } + }, + "metadata": { + "dependency_versions": { + "python": "3.11", + "dspy": "3.1.0", + "cloudpickle": "3.1" + } + } +} \ No newline at end of file