{
  "version": "1.0",
  "name": "FinancialTouchstone Evaluation Prompts",
  "description": "Official prompts for evaluating AI models on the FinancialTouchstone benchmark",
  "created_by": {
    "organization": "Chair of Data Science & Natural Language Processing (DS-NLP)",
    "institution": "Institute of Computer Science, University of St. Gallen"
  },
  "question_prompts": {
    "base": {
      "name": "Base Question Prompt",
      "description": "Core instructions for all question types",
      "prompt": "You are a financial analyst with a strict instruction: answer the user's question based *only* on the provided context below.\n\n**CRITICAL RULES:**\n1. **Ground all claims in the provided text.** Do not use any external knowledge.\n2. **Do not invent, calculate, or estimate any figures or metrics** (e.g., totals, percentages, Free Cash Flow) if they are not explicitly stated in the context.\n3. **If the context does not contain the answer, you MUST state \"The provided context does not contain this information.\"** and stop. Do not try to infer the answer.\n4. **Quote directly when possible** or cite the information accurately.\n5. **You can coarsely round the numbers. No exact numbers need to be provided. For example: 17'777m can be rounded to 18bn if convenient for you.**\n\n## Question\n\n{question}\n\n=== CONTEXT START ===\n{rag_context}\n=== CONTEXT END ==="
    },
    "key_financials": {
      "name": "Key Financials",
      "description": "Extract key financial metrics such as net income, EBITDA, and key financial ratios",
      "prompt": "### Question: Key Financials - Task Definition and What Constitutes Key Financials\n\nExtract key financial metrics such as net income, EBITDA, and key financial ratios.\n\nOther financial KPIs reported by the company are also accepted.\n\nReport those numbers that are relevant to the company at hand. Not all of the mentioned KPIs may always be reported by the companies.\n\nProvide specific numbers and percentages.\n\n### Desired Accuracy for This Question\n\nYou will be evaluated on whether you found most of the figures from the golden source.\n\nNo 100% overlap with the golden source is required. At least three of the facts from the golden source should be correctly identified by you.\n\nIt is still best for you to find as many figures as possible to maximize the chance of hitting the desired ones."
    },
    "cash_flow": {
      "name": "Cash Flow",
      "description": "Extract cash flow figures from financial statements",
      "prompt": "### Question: Cash Flow\n\nExtract the following cash flow figures for the current year and provide each value exactly as stated.\n\n- Cash flows from all reported activities\n- Free cash flow\n- Total cash flow (Net increase/decrease in cash)"
    },
    "revenue": {
      "name": "Revenue",
      "description": "Find and extract company revenue with industry-specific terminology",
      "prompt": "### Question: Current Revenue\n\nFind and extract the revenue of the company.\n\nIf the company is in certain industries such as banking, insurance, or hospitality, the revenue line item may carry a different name.\n\n#### Synonyms for \"Revenue\" in Different Industries\n\n* Premiums Earned or Net Premiums Earned: This is the insurance equivalent of revenue\n* Gross Premiums Written: Total premiums collected (before reinsurance)\n* Net Premiums Written: Premiums after reinsurance costs\n* Investment Income: Secondary revenue from investing premium reserves\n* The \"net\" concept is similar to that used by banks: insurers report premiums net of reinsurance costs\n* Net sales"
    },
    "revenue_growth": {
      "name": "Revenue Growth",
      "description": "Analyze year-over-year revenue growth",
      "prompt": "### Question: Revenue Growth\n\nAnalyze revenue growth for the current year vs. the previous year.\n\nInclude the reported revenue growth rate. Alternatively, provide the current revenue and the revenue of the prior year.\n\nIf you find multiple revenue growth figures, such as \"organic revenue growth,\" it is sufficient to report only one of them.\n\nProvide specific percentages, or provide the revenues for the current and the prior year if no percentage is given."
    },
    "segments": {
      "name": "Business Segments",
      "description": "Identify and describe company business segments",
      "prompt": "### Question: Segments\n\nIdentify and briefly describe the company's business segments.\n\nProvide specific numbers and growth rates."
    },
    "company_type": {
      "name": "Company Type / Legal Form",
      "description": "Determine the legal form and corporate structure",
      "prompt": "### Question: Legal Form\n\nBased ONLY on the provided context, determine the legal form and structure of the company (e.g., Inc., LLC, GmbH, AG, etc.).\n\nInclude information about the company's incorporation, jurisdiction, and corporate structure.\n\nDo not mention \"Inc.\" if \"Incorporated\" or \"Inc\" are not explicitly named. If \"Co,\" \"Corp,\" or \"Corporation\" are the only things mentioned, use this. Do not change the abbreviation. Just take exactly what is mentioned in the source.\n\nThe evaluator will be lenient here, and will allow you to answer \"Co\" even if \"Corporation\" was mentioned in the text, but please still stick closely to the context here."
    }
  },
  "grading_prompts": {
    "base": {
      "name": "Base Grading Prompt",
      "description": "Core evaluation principles and procedure for LLM-based grading",
      "principles": {
        "source_of_truth": "The Retrieved Context is the Single Source of Truth. Base the entire evaluation on the Retrieved Context only.",
        "hallucination_definition": "A hallucination is any factual claim in the Model's Answer that cannot be verified using the Retrieved Context. Extra information that IS present in the context is NOT a hallucination.",
        "correctness_definition": "An answer is correct if: (1) Every verifiable claim is factually accurate according to Retrieved Context, and (2) The answer successfully addresses the core requirement of the Golden Answer."
      },
      "evaluation_procedure": [
        "Step 1: Hallucination Verification - Check each factual claim against the Retrieved Context",
        "Step 2: Correctness Assessment - Check for factual errors, then check relevance and completeness",
        "Step 3: If incorrect, assess whether the retriever was insufficient (missing information)"
      ],
      "output_format": {
        "answer_correct": "yes/no",
        "hallucination_present": "yes/no",
        "retriever_insufficient": "yes/no",
        "explanation": "detailed explanation"
      }
    },
    "default": {
      "name": "Default Grading Mode",
      "description": "Standard evaluation against gold standard",
      "prompt": "### Task Details\n\nPlease evaluate the model answer against the gold standard using the criteria above."
    },
    "strict": {
      "name": "Strict Grading Mode",
      "description": "Rigorous evaluation with high accuracy requirements",
      "criteria": [
        "Numerical accuracy (even small discrepancies should be penalized)",
        "Completeness (missing any element from gold standard reduces score)",
        "No hallucinations or unsupported claims",
        "Proper financial terminology"
      ],
      "output_format": {
        "scores": {
          "accuracy": "integer_score",
          "completeness": "integer_score",
          "relevance": "integer_score",
          "clarity": "integer_score"
        },
        "overall_score": "weighted_average_float",
        "overall_feedback": "summary",
        "critical_errors": "list any major errors or omissions"
      }
    },
    "lenient": {
      "name": "Lenient Grading Mode",
      "description": "Flexible evaluation focusing on core concepts",
      "criteria": [
        "General correctness of concepts",
        "Reasonable approximations are acceptable",
        "Credit for partial understanding",
        "Focus on the main points rather than minor details"
      ],
      "output_format": {
        "scores": {
          "accuracy": "integer_score",
          "completeness": "integer_score",
          "relevance": "integer_score",
          "clarity": "integer_score"
        },
        "overall_score": "weighted_average_float",
        "overall_feedback": "summary",
        "strengths": "list key strengths"
      }
    }
  },
  "usage_notes": [
    "Combine base prompt with question-specific prompt for complete task instructions",
    "Use grading prompts with an LLM to evaluate model responses",
    "The default grading mode is recommended for standard benchmark evaluation",
    "Strict mode is useful for detailed error analysis",
    "Lenient mode is useful for understanding model capabilities"
  ]
}