diff --git a/tutorials/ai-core-genaihub-evaluation/PUT_YOUR_CUSTOM_METRIC_HERE/custom-llm-metric.json b/tutorials/ai-core-genaihub-evaluation/PUT_YOUR_CUSTOM_METRIC_HERE/custom-llm-metric.json new file mode 100644 index 000000000..fbce301d6 --- /dev/null +++ b/tutorials/ai-core-genaihub-evaluation/PUT_YOUR_CUSTOM_METRIC_HERE/custom-llm-metric.json @@ -0,0 +1,34 @@ +{ + "createdAt": "2025-08-18 09:38:01.990700", + "name": "groundedness", + "scenario": "genai-evaluations", + "version": "0.0.1", + "evaluationMethod": "llm-as-a-judge", + "metricType": "evaluation", + "managedBy": "imperative", + "systemPredefined": false, + "spec": { + "promptType": "free-form", + "configuration": { + "modelConfiguration": { + "name": "gpt-4o", + "version": "2024-08-06", + "parameters": [ + { + "key": "temperature", + "value": "0.1" + }, + { + "key": "max_tokens", + "value": "110" + } + ] + }, + "promptConfiguration": { + "systemPrompt": "You should strictly follow the instruction given to you. Please act as an impartial judge and evaluate the quality of the responses based on the prompt and following criteria:", + "userPrompt": "You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models. We will provide you with a reference and an AI-generated response. You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the criteria provided in the Evaluation section below. You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step-by-step explanations for your rating, and only choose ratings from the Rating Rubric.\n\n## Metric Definition\nYou are an INFORMATION OVERLAP classifier providing the overlap of information between a response and reference.\n\n## Criteria\nGroundedness: The overlap of information between a response generated by AI models and provided reference.\n\n## Rating Rubric\n5: (Fully grounded). 
The response and the reference are fully overlapped.\n4: (Mostly grounded). The response and the reference are mostly overlapped.\n3: (Somewhat grounded). The response and the reference are somewhat overlapped.\n2: (Poorly grounded). The response and the reference are slightly overlapped.\n1: (Not grounded). There is no overlap between the response and the reference.\n\n## Evaluation Steps\nSTEP 1: Assess the response in aspects of Groundedness. Identify any information in the response and provide assessment according to the Criteria.\nSTEP 2: Score based on the rating rubric. Give a brief rationale to explain your evaluation considering Groundedness.\n\nReference: {{?reference}}\nResponse: {{?aicore_llm_completion}}\n\nBegin your evaluation by providing a short explanation. Be as unbiased as possible. After providing your explanation, please rate the response according to the rubric and outputs STRICTLY following this JSON format:\n\n{ \"explanation\": string, \"rating\": integer }\n\nOutput:\n", + "dataType": "numeric" + } + } + } +} \ No newline at end of file diff --git a/tutorials/ai-core-genaihub-evaluation/PUT_YOUR_CUSTOM_METRIC_HERE/custom-llm-metric.jsonl b/tutorials/ai-core-genaihub-evaluation/PUT_YOUR_CUSTOM_METRIC_HERE/custom-llm-metric.jsonl new file mode 100644 index 000000000..c73e47a55 --- /dev/null +++ b/tutorials/ai-core-genaihub-evaluation/PUT_YOUR_CUSTOM_METRIC_HERE/custom-llm-metric.jsonl @@ -0,0 +1 @@ +{"createdAt":"2025-08-18 09:38:01.990700","name":"groundedness","scenario":"genai-evaluations","version":"0.1.6","evaluationMethod":"llm-as-a-judge", "metricType":"evaluation", "managedBy":"imperative","systemPredefined":false,"spec":{"promptType":"free-form","configuration":{"modelConfiguration":{"name":"gpt-4o","version":"2024-08-06","parameters":[{"key":"temperature","value":"0.1"},{"key":"max_tokens","value":"110"}]},"promptConfiguration":{"systemPrompt":"You should strictly follow the instruction given to you. 
Please act as an impartial judge and evaluate the quality of the responses based on the prompt and following criteria:","userPrompt":"You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models. We will provide you with a reference and an AI-generated response. You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the criteria provided in the Evaluation section below. You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step-by-step explanations for your rating, and only choose ratings from the Rating Rubric.\n\n## Metric Definition\nYou are an INFORMATION OVERLAP classifier providing the overlap of information between a response and reference.\n\n## Criteria\nGroundedness: The overlap of information between a response generated by AI models and provided reference.\n\n## Rating Rubric\n5: (Fully grounded). The response and the reference are fully overlapped.\n4: (Mostly grounded). The response and the reference are mostly overlapped.\n3: (Somewhat grounded). The response and the reference are somewhat overlapped.\n2: (Poorly grounded). The response and the reference are slightly overlapped.\n1: (Not grounded). There is no overlap between the response and the reference.\n\n## Evaluation Steps\nSTEP 1: Assess the response in aspects of Groundedness. Identify any information in the response and provide assessment according to the Criteria.\nSTEP 2: Score based on the rating rubric. Give a brief rationale to explain your evaluation considering Groundedness.\n\nReference: {{?reference}}\nResponse: {{?aicore_llm_completion}}\n\nBegin your evaluation by providing a short explanation. Be as unbiased as possible. 
After providing your explanation, please rate the response according to the rubric and outputs STRICTLY following this JSON format:\n\n{ \"explanation\": string, \"rating\": integer }\n\nOutput:\n","dataType":"numeric"}}}} \ No newline at end of file diff --git a/tutorials/ai-core-genaihub-evaluation/PUT_YOUR_DATASET_HERE/medicalqna_dataset.csv b/tutorials/ai-core-genaihub-evaluation/PUT_YOUR_DATASET_HERE/medicalqna_dataset.csv new file mode 100644 index 000000000..21ad421d1 --- /dev/null +++ b/tutorials/ai-core-genaihub-evaluation/PUT_YOUR_DATASET_HERE/medicalqna_dataset.csv @@ -0,0 +1,70 @@ +question,sentiment,reference +how does rivatigmine and otc sleep medicine interact,Interaction,"tell your doctor and pharmacist what prescription and nonprescription medications, vitamins, nutritional supplements, and herbal products you are taking or plan to take. Be sure to mention any of the following: antihistamines; aspirin and other nonsteroidal anti-inflammatory medications (NSAIDs) such as ibuprofen (Advil, Motrin) and naproxen (Aleve, Naprosyn); bethanechol (Duvoid, Urecholine); ipratropium (Atrovent, in Combivent, DuoNeb); and medications for Alzheimer's disease, glaucoma, irritable bowel disease, motion sickness, ulcers, or urinary problems. Your doctor may need to change the doses of your medications or monitor you carefully for side effects." +how does valium affect the brain,Action,"Diazepam is a benzodiazepine that exerts anxiolytic, sedative, muscle-relaxant, anticonvulsant and amnestic effects. Most of these effects are thought to result from a facilitation of the action of gamma aminobutyric acid (GABA), an inhibitory neurotransmitter in the central nervous system." +what is morphine,Information,Morphine is a pain medication of the opiate family which is found naturally in a number of plants and animals.[5][7] It acts directly on the central nervous system (CNS) to decrease the feeling of pain. 
+what are the milligrams for oxycodone e,Dose,� 10 mg � 20 mg � 40 mg � 80 mg ... +81% aspirin contain resin and shellac in it. ?,Ingredient,Inactive Ingredients Ingredient Name +what is desonide ointment used for,Indication,"Desonide is used to treat the redness, swelling, itching, and discomfort of various skin conditions, including psoriasis (a skin disease in which red, scaly patches form on some areas of the body and eczema (a skin disease that causes the skin to be dry and itchy and to sometimes develop red, scaly rashes)." +how soon can tylenol be taken after a cocktail?,Interaction,"According to the National Health Service (NHS) in the UK, it is usually safe to drink a small amount of alcohol while taking this pain reliever. ... However, when people take acetaminophen at high doses or together with alcohol, it can cause side effects ranging from minor to severe, with the possibility of fatal liver damage. This risk may be higher for people with alcohol use disorder (AUD), which was previously known as alcoholism.... According to the U.S. National Library of Medicine, taking acetaminophen can be dangerous for people who regularly drink alcohol. Manufacturers currently recommend that people who have more than 3 alcoholic drinks per day should ask their doctor before taking acetaminophen." +breo inhaler how it works,Action,"The combination of fluticasone and vilanterol is used to control wheezing, shortness of breath, coughing, and chest tightness caused by asthma and chronic obstructive pulmonary (COPD; a group of diseases that affect the lungs and airways, that includes chronic bronchitis and emphysema). Fluticasone is in a class of medications called steroids. It works by reducing swelling in the airways. Vilanterol is in a class of medications called long-acting beta-agonists (LABAs). It works by relaxing and opening air passages in the lungs, making it easier to breathe." 
+breo inhaler how it works,Usage,"To use the inhaler, follow these steps: + 1 If you will be using a new inhaler for the first time, remove it from the box and the foil wrapper. Fill in the ""Tray opened"" and ""Discard"" blanks on the inhaler label with the date that you opened the pouch and the date 6 weeks later when you must replace the inhaler. + 2 When you are ready to inhale your dose, slide the cover down to expose the mouthpiece until it clicks. If you open and close the inhaler without using your dose, you will waste the medication. + 3 The counter will count down by 1 each time you open the cover. If the counter does not count down, your inhaler will not provide the medicine. If your inhaler does not count down, call your pharmacist or doctor. + 4 Hold the inhaler away from your mouth and breathe out as far as you comfortably can. Do not breathe out into the mouthpiece. + 5 Put the mouthpiece between your lips, and close your lips firmly around it. Take a long, steady, deep breath in through your mouth. Do not breathe in through your nose. Be careful not block the air vent with your fingers. + 6 Remove the inhaler from your mouth, and hold your breath for about 3 to 4 seconds or as long as you comfortably can. Breathe out slowly. + 7 You may or may not taste or feel the medicine released by the inhaler. Even if you do not, do not inhale another dose. If you are not sure you are getting your dose of fluticasone and vilanterol, call your doctor or pharmacist. + 8 You may clean the mouthpiece with a dry tissue, if needed. Slide the cover up over the mouthpiece as far as it will go to close the inhaler. + 9 Rinse your mouth with water, but do not swallow. +Ask your pharmacist or doctor for a copy of the manufacturer's information for the patient." +qvar 40mg what is it for,Indication,"QVAR is indicated in the maintenance treatment of asthma as prophylactic therapy in patients 5 years of age and older. 
QVAR is also indicated for asthma patients who require systemic corticosteroid administration, where adding QVAR may reduce or eliminate the need for the systemic corticosteroids." +does cyclosporine ophthalmic helps for iritis?,Indication,This study showed improvement of recurrent anterior uveitis [iritis] in patients while on conventional treatment with cyclosporine A 0.05% compared with conventional treatment alone. +what ingredient in walnut interferes with synthroid drug absorption,Interaction,"Dietary fiber: Certain dietary fiber sources can impede absorption of the thyroid hormone replacement medication. Mayo Clinic staff say it is best to avoid dietary fiber in foods like walnuts, soy products, iron supplements and multivitamins containing iron." +what is the color of the fluvaastatin pill,Appearance,Product Characteristics Color RED (rust) +"is penicillin in the pill ""montelukast?""",Ingredient,"What are the ingredients in montelukast sodium tablets? + +Active ingredient: montelukast sodium, USP + +Inactive ingredients: + +10 mg tablet: croscarmellose sodium, hydroxypropyl cellulose, lactose monohydrate, magnesium stearate, and microcrystalline cellulose. The film coating contains: black iron oxide, hydroxypropyl cellulose, hypromellose, red iron oxide, titanium dioxide, and yellow iron oxide." +"can i take metamucil with ""ciprofloxacin?""",Interaction,"diarrhea is a common problem caused by antibiotics which usually ends when the antibiotic is discontinued. Sometimes after starting treatment with antibiotics, patients can develop watery and bloody stools (with or without stomach cramps and fever) even as late as two or more months after having taken the last dose of the antibiotic. If this occurs, patients should contact their physician as soon as possible.�" +how long before a meal should lansoprazole be taken,Usage,Swallow 1 capsule with a glass of water before eating in the morning. 
+what does using fluorouracil make your face look like,Side effects,"The most frequent adverse reactions to Fluorouracil 5% Topical Cream occur locally and are often related to an extension of the pharmacological activity of the drug. These include burning, crusting, allergic contact dermatitis, erosions, erythema, hyperpigmentation, irritation, pain, photosensitivity, pruritus, scarring, rash, soreness and ulceration." +why did my doctor give me level iracetam,Indication,Levetiracetam is used in combination with other medications to treat certain types of seizures in adults and children with epilepsy. Levetiracetam is in a class of medications called anticonvulsants. It works by decreasing abnormal excitement in the brain. +results of stopping terazosin?,Usage,"The effect of withdrawal of terazosin therapy in patients with mild to moderate hypertension was assessed in two double-blind, placebo-controlled studies. All patients had demonstrated a stable blood pressure response to terazosin prior to withdrawal of the drug. Patients were randomly assigned either to continue treatment with terazosin at a previously established dose that had brought blood pressure under control (dose range: 1 to 40 mg daily) or to receive a matching placebo. At the end of a six- or eight-week withdrawal period, placebo-treated patients experienced mean increases of 7.3 and 12.4 mm Hg in supine diastolic blood pressure (studies M81-020 and M81-028 site 1, respectively). These increases were significantly greater than those observed for patients who continued to receive terazosin. Similar results were observed in other blood pressure variables. Withdrawal of terazosin was accompanied by a significant weight loss (2.8 and 3.6 pounds in studies M81-020 and M81-028, respectively). There were no clinically significant changes in pulse rates, physical examinations, laboratory test results, or electrocardiograms. 
Headache was the most common adverse experience reported by those who received placebo during the drug withdrawal period. These studies demonstrate that withdrawal of terazosin therapy is associated with an increase in supine diastolic blood pressure, often to hypertensive levels, without signs of a withdrawal syndrome." +what meloxicam look like,Appearance,Product Characteristics Color YELLOW (light yellow) Score no score Shape OVAL Size 3mm Imprint Code S160 +nitroglycerin how often,Usage,"One tablet should be dissolved under the tongue or in the buccal pouch at the first sign of an acute anginal attack. The dose may be repeated approximately every 5 minutes until relief is obtained. If the pain persists after a total of 3 tablets in a 15-minute period, or if the pain is different than is typically experienced, prompt medical attention is recommended. Nitroglycerin may be used prophylactically 5 to 10 minutes prior to engaging in activities that might precipitate an acute attack." +whate is vitamin c chemicl symple ?,Information,Active Ingredient/Active Moiety ... ASCORBIC ACID ... +what is the maximum dose of pregabalin,Dose,"In view of the dose-dependent adverse reactions, treatment with doses above 300 mg/day is not recommended" +how long does marijuana it stay in system,Action/time,"The effects of marijuana usually last from 1 to 3 hours, but marijuana can stay in the body for days or even weeks after use. Organs in the body have fatty tissues that absorb the THC in marijuana. In general, standard urine tests can detect THC several days after use. In people who use heavily, however, urine tests can sometimes detect THC for several weeks." 
+neupro and ropinirole when is it safe to take,Interaction,"Anxiolytics; Sedatives; and Hypnotics: (Moderate) A reduction in the dose of anxiolytics, sedatives, hypnotics and concomitantly administered dopamine agonists with sedative properties (e.g., ropinirole, pramipexole, rotigotine, apomorphine) should be considered to minimize additive sedative effects. In addition, the risk of next-day psychomotor impairment is increased during co-administration, which may decrease the ability to perform tasks requiring full mental alertness such as driving." +neupro and ropinirole when is it safe to take,Comparison,"Switching from oral dopamine agonists to rotigotine: An open-label study of 99 subjects with Parkinson�s disease was conducted in which the subjects, previously treated with 3 to 12mg/day ropinirole with or without levodopa, were converted to treatment with transdermal rotigotine. The following dosage conversion was utilized; 3mg/day ropinirole to 2mg/24 hours rotigotine, 6mg/day ropinirole to 4mg/24 hours rotigotine, 8-9mg/day ropinirole to 6mg/24 hours rotigotine, 12mg/day ropinirole to 8mg/24 hours rotigotine. Patients were instructed to take their last dose of ropinirole in the afternoon or evening, applying a rotigotine patch the next morning upon awakening. Overall this study determined that an overnight switch from ropinirole to rotigotine was generally well tolerated without loss of efficacy." +what is prevnar >65,Information,The pneumococcal conjugate vaccine (PCV13 or Prevnar 13�) protects against 13 types of pneumococcal bacteria. CDC recommends PCV13 for use in infants and young children and adults 65 years or older. +how many mg does it take to overdose on oxycodone,Overdose,"OXYCODONE HCl CONTROLLED-RELEASE 80 mg and 160 mg Tablets, or a single dose greater than 40 mg, ARE FOR USE IN OPIOID-TOLERANT PATIENTS ONLY. 
A single dose greater than 40 mg, or total daily doses greater than 80 mg, may cause fatal respiratory depression when administered to patients who are not tolerant to the respiratory depressant effects of opioids." +what medication not to take with lithium,Interaction,What special precautions should I follow? +mst drug/?,Information,"MST�Continus� 5 mg, 10 mg, 15 mg, 30 mg, 60 mg, 100 mg and 200 mg prolonged release tablets: Morphine sulfate" +what size doses of metformin are available?,Dose,"Metformin Hydrochloride Tablets, USP ... 500 mg ... 850 mg ... 1000 mg" +"pravastatin s9 orange how many ""grams?�""",Dose,No answers +how long morphine remains in body,Action/time,"Morphine takes longer to work than heroin and the effects tend to last longer. Despite this, blood tests can only detect morphine for the first 12 hours after the last dose, and urine tests only work for up to 3 days. However, saliva tests are more effective, being able to detect traces of morphine for up to 4 days. Again, morphine stays in the hair for 90 days." +"what is the imprint on metoprolol succ., 50 mg",Appearance,"50 mg tablets: White, round, coated tablets debossed with Andrx logo and �831� on one side and scored on the other side." +what can take the place of tramadol,Alternatives,"The American Academy of Pediatrics (AAP) and other pediatric associations and academies have released guidelines on the management of nociceptive pain in children. The top 3 medications� recommendations in children are paracetamol, ibuprofen, and opioids: non-opioids for mild nociceptive pain; non-opioids + weak opioids for moderate nociceptive pain and non-opioids + strong opioids for severe nociceptive pain. Codeine and tramadol are the only two opioids classified as weak opioids. In most countries, they do not require a restricted medical drug prescription and as �weak� opioids, they are often considered to have a lower potential for adverse drug reactions (ADR) than �strong� opioids." 
+how to administer denosumab,Usage,"Denosumab injection comes as a solution (liquid) to be injected subcutaneously (under the skin) in your upper arm, upper thigh, or stomach area. It is usually injected by a doctor or nurse in a medical office or clinic. Denosumab injection (Prolia) is usually given once every 6 months. When denosumab injection (Xgeva) is used to reduce the risk of fractures from multiple myeloma, or cancer that has spread to the bones, it is usually given once every 4 weeks. When denosumab injection (Xgeva) is used to treat giant cell tumor of bone, or high calcium levels caused by cancer, it is usually given every 7 days for the first three doses (on day 1, day 8, and day 15) and then once every 4 weeks starting 2 weeks after the first three doses. + +Your doctor will tell you to take supplements of calcium and vitamin D while you are being treated with denosumab injection. Take these supplements exactly as directed. + +When denosumab injection (Prolia) is used to treat osteoporosis or bone loss, your doctor or pharmacist will give you the manufacturer's patient information sheet (Medication Guide) when you begin treatment with denosumab injection and each time you refill your prescription. Read the information carefully and ask your doctor or pharmacist if you have any questions. You can also visit the Food and Drug Administration (FDA) website (http://www.fda.gov/Drugs/DrugSafety/ucm085729.htm) or the manufacturer's website to obtain the Medication Guide." +what is barbiturates,Information,"Barbiturates are sedative-hypnotic drugs that were once commonly used as sedatives or antianxiety medications. A physician must prescribe barbiturates; otherwise, their use is considered illicit. Among their limited uses, barbiturates are used to manage some seizure disorders as well as for pre-procedural sedation. In rarer instances, they are prescribed for the treatment of headache, anxiety and insomnia. 
However, their use in most areas of medicine has largely been supplanted by other safer medications. Barbiturates are controlled substances due to the potential they pose for abuse, physical dependence, and addiction. Some of the more common barbiturates include Luminal (phenobarbital). Brevital (methohexital). Seconal (secobarbital). Butisol (butabarbital). Fiorinal (butalbital)." +what are the inactive ingredients to the pneumonia vaccine,Ingredient,Inactive Ingredients POLYSORBATE 80 � ALUMINUM PHOSPHATE +how to prep and administer insulin,Usage,"Humulin R U-100 may be administered by subcutaneous injection in the abdominal wall, the thigh, the gluteal region or in the upper arm. Subcutaneous injection into the abdominal wall ensures a faster absorption than from other injection sites. Injection into a lifted skin fold minimizes the risk of intramuscular injection. Injection sites should be rotated within the same region. As with all insulin, the duration of action will vary according to the dose, injection site, blood flow, temperature, and level of physical activity. Intravenous administration of Humulin R U-100 is possible under medical supervision with close monitoring of blood glucose and potassium levels to avoid hypoglycemia and hypokalemia. For intravenous use, Humulin R U-100 should be used at concentrations from 0.1 unit/mL to 1 unit/mL in infusion systems with the infusion fluids 0.9% sodium chloride using polyvinyl chloride infusion bags." +what is medical marijuana,Information,"Some states have approved ""medical marijuana"" to ease symptoms of various health problems. The U.S. Food and Drug Administration (FDA) has not approved the marijuana plant as a medicine. However, there have been scientific studies of cannabinoids, the chemicals in marijuana. This has led to two FDA-approved medicines. They contain THC, the active ingredient in marijuana. 
They treat nausea caused by chemotherapy and increase appetite in patients who have severe weight loss from HIV/AIDS. Scientists are doing more research with marijuana and its ingredients to treat many diseases and conditions." +"clonazepam "".25mg"" lowest dosage?",Dose,"Klonopin Wafers (clonazepam orally disintegrating tablets) are white, round and debossed with the tablet strength � 0.125 mg debossed 1/8 �" +levaquin treat uti?,Indication,... Complicated Urinary Tract Infections: ... Acute Pyelonephritis: ... Uncomplicated Urinary Tract Infections +"vitamin d 25, totalhow much to takea day",Dose,"Currently, there�s scientific debate about how much vitamin D people need each day. The Institute of Medicine, in a long-awaited report released on November 30, 2010 recommends tripling the daily vitamin D intake for children and adults in the U.S. and Canada, to 600 IU per day. (7) The report also recognized the safety of vitamin D by increasing the upper limit from 2,000 to 4,000 IU per day, and acknowledged that even at 4,000 IU per day, there was no good evidence of harm. The new guidelines, however, are overly conservative about the recommended intake, and they do not give enough weight to some of the latest science on vitamin D and health. For bone health and chronic disease prevention, many people are likely to need more vitamin D than even these new government guidelines recommend." +sickness in humans caused formaldehyde on toys from china?,Side effects,"The Uphill Battle to Better Regulate Formaldehyde ... Safety advocates say that tighter restrictions ... are necessary, particularly for products coming from China, where items as varied as toys and Christmas lights have been found to violate American safety standards." +is cyclobenzaprine a benzodiazepine?,Information,"Cyclobenzaprine is in a class of medications called skeletal muscle relaxants. It works by acting in the brain and nervous system to allow the muscles to relax. �............ 
Benzodiazepines (sometimes called ""benzos"") work to calm or sedate a person, by raising the level of the inhibitory neurotransmitter GABA in the brain. Common benzodiazepines include diazepam (Valium), alprazolam (Xanax), and clonazepam (Klonopin), among others." +what does vitamin d3 do,Action,"Vitamin D helps your body absorb calcium. Calcium is one of the main building blocks of bone. A lack of vitamin D can lead to bone diseases such as osteoporosis or rickets. Vitamin D also has a role in your nerve, muscle, and immune systems." +what drugs contain in estrone injection,Ingredient,"Estrone, sold under the brand names Estragyn, Kestrin, and Theelin among many others, is an estrogen medication and naturally occurring steroid hormone which has been used in menopausal hormone therapy and for other indications.[5][8][9][10][1][2] It has been available as an aqueous suspension or oil solution that is given by injection into muscle and as a vaginal cream that is applied inside of the vagina.[1][2][3][4] It can also be taken by mouth in the form of estrone sulfate, as in estropipate (piperazine estrone sulfate; Ogen) and conjugated estrogens (Premarin).[11][2][5]" +can i eat after taking rapaflo?,Usage,The recommended dose is 8 mg orally once daily with a meal. +how much levothyroxine is needed to treat hashimotos,Dose,"If Hashimoto's disease causes thyroid hormone deficiency, you may need replacement therapy with thyroid hormone. This usually involves daily use of the synthetic thyroid hormone levothyroxine (Levoxyl, Synthroid, others). ... Treatment with levothyroxine is usually lifelong, but because the dosage you need may change, your doctor is likely to check your TSH level about every 12 months." 
diff --git a/tutorials/ai-core-genaihub-evaluation/PUT_YOUR_PROMPT_TEMPLATE_HERE/prompt_template.json b/tutorials/ai-core-genaihub-evaluation/PUT_YOUR_PROMPT_TEMPLATE_HERE/prompt_template.json new file mode 100644 index 000000000..c22605a33 --- /dev/null +++ b/tutorials/ai-core-genaihub-evaluation/PUT_YOUR_PROMPT_TEMPLATE_HERE/prompt_template.json @@ -0,0 +1,8 @@ +{ + "template": [ + { + "role": "user", + "content": "List the benefits and side effects of the drug in the following consumer health question: {{?question}}." + } + ] +} \ No newline at end of file diff --git a/tutorials/ai-core-genaihub-evaluation/ai-core-genaihub-evaluation.md b/tutorials/ai-core-genaihub-evaluation/ai-core-genaihub-evaluation.md new file mode 100644 index 000000000..546d84d15 --- /dev/null +++ b/tutorials/ai-core-genaihub-evaluation/ai-core-genaihub-evaluation.md @@ -0,0 +1,2520 @@ +--- +parser: v2 +auto_validation: true +time: 45 +primary_tag: software-product>sap-business-technology-platform +tags: [ tutorial>beginner, topic>artificial-intelligence, topic>machine-learning, software-product>sap-business-technology-platform ] +author_name: Smita Naik +author_profile: https://github.com/I321506 +--- + +# Using Evaluation Service available in SAP AI Core + This tutorial demonstrates how to use SAP AI Core Custom Evaluation to benchmark Large Language Models (LLMs) using two different approaches **Prompt Registry** and **Orchestration Registry**. It guides you through dataset preparation, environment setup, configuration creation, execution, and result analysis in a unified and simplified workflow. + +It extends the Quick Start tutorial and is intended for Application Developers and Data Scientists who already know the basics of GenAI workflows in SAP AI Core. + +## You will learn +- How to prepare and organize datasets for evaluation. +- How to choose between **Prompt Registry** and **Orchestration Registry** approaches. +- How to configure and run evaluations in SAP AI Core. 
+- How to analyze and interpret aggregated evaluation results. + +## Prerequisites + +- Setup Environment: +Ensure your instance and AI Core credentials are properly configured according to the steps provided in the initial tutorial +- Orchestration Deployment: +Ensure at least one orchestration deployment is ready to be consumed during this process. +Refer to [this tutorial to understand the basic consumption of GenAI models using orchestration.](https://developers.sap.com/tutorials/ai-core-orchestration-consumption.html) +- Basic Knowledge: Familiarity with the orchestration workflow is recommended +- Install Dependencies: Install the required Python packages using the requirements.txt file provided. +Download [requirements.txt](img/requirements.txt) +💡 Right-click the link above and choose **"Save link as..."** to download it directly. + +**Below are the Steps to Run a GenAI Evaluation in SAP AI Core** + +## Pre-Read + +The structure of the input data should be as follows: + +``` +Root +├── PUT_YOUR_PROMPT_TEMPLATE_HERE +| ├── prompt_template.json +│ +├── PUT_YOUR_DATASET_HERE +│ ├── medicalqna_dataset.csv +| +└── PUT_YOUR_CUSTOM_METRIC_HERE + ├── custom-llm-metric.json + ├── custom-llm-metric.jsonl +``` + +**Dataset and Configuration**: +To run this evaluation, all required input files must be placed inside the folder structure provided in the repository: + +You can download or clone the complete folder from the link below and place your files inside the respective folders [Download / Open Full Folder Structure](https://github.com/SAP-samples/aicore-genai-samples/blob/main/genai-sample-apps/prompt-evaluation) + + 1. **Prompt Template Configuration (`PUT_YOUR_PROMPT_TEMPLATE_HERE`)** + * Place one or more prompt template configurations as JSON files in this folder. + 2. **Test Dataset (`PUT_YOUR_DATASET_HERE`)** + * The test dataset should be a CSV, JSON, or JSONL file containing prompt variables, ground truth references, and other data required for evaluation. 
+ 3. **Custom Metrics (`PUT_YOUR_CUSTOM_METRIC_HERE`)** + * (Optional) You can provide custom metric definitions in a single JSON or JSONL file. For JSONL, each line should be a JSON object defining one metric. For JSON, it should be an array of metric-definition objects. + +### Environment Variables Setup + +[OPTION BEGIN [SAP AI Launchpad]] + +- Navigate to your SAP AI Core Launchpad. + +- In the Workspaces section, click on "Add" to create a new workspace. + - A workspace in SAP AI Core is a logical container that holds your resources (like models and pipelines) and provides the isolation needed for your projects. + +- When prompted, enter your AI Core credentials (such as Client ID, Client Secret, and Base URL). + - Note: If you're unsure about where to find these credentials, refer to this [guide](https://developers.sap.com/tutorials/ai-core-generative-ai.html#1c4f36d7-f345-4822-be00-c15f133ff7d8). + +- Once the workspace is successfully created, select your desired Resource Group to begin the evaluation process. + +Refer to the screenshot below for guidance: +![img](img/image_34.png) + +[OPTION END] + +[OPTION BEGIN [Python]] + +- Open **Visual Studio Code or Jupyter Notebook**. Create a new file with the .ipynb extension (e.g., custom_evaluation.ipynb). +- Create a **.env** file in the root directory of your project. +- Add your **AI Core** and **AWS credentials** as shown below. + +```env +# AICORE CREDENTIALS +AICORE_CLIENT_ID= +AICORE_CLIENT_SECRET= +AICORE_AUTH_URL= +AICORE_BASE_URL= +AICORE_RESOURCE_GROUP= + +# AWS CREDENTIALS +AWS_ACCESS_KEY= +AWS_BUCKET_ID= +AWS_REGION= +AWS_SECRET_ACCESS_KEY= + +# ORCHESTRATION DEPLOYMENT URL +DEPLOYMENT_URL= +``` + +**Note:** Replace placeholders (e.g., CLIENT_ID, CLIENT_SECRET, etc) with your actual environment credentials. 
+ +Refer to the below screenshot for clarity: +![img](img/image_1.png) + +#### Install Dependencies + +Install the required packages using the [requirements.txt](img/requirements.txt) file you downloaded in the Prerequisites section. +```bash +pip install -r requirements.txt +``` +#### Connect to AI Core Instance + +Once the environment variables are set and dependencies are installed, run the following code to connect to your instance: + +```PYTHON +# Loading the credentials from the env file +from gen_ai_hub.proxy.gen_ai_hub_proxy import GenAIHubProxyClient +from dotenv import load_dotenv +import os + +load_dotenv(override=True) + +# Fetching environment variables +AICORE_BASE_URL = os.getenv("AICORE_BASE_URL") +AICORE_RESOURCE_GROUP = os.getenv("AICORE_RESOURCE_GROUP") +AICORE_AUTH_URL = os.getenv("AICORE_AUTH_URL") +AICORE_CLIENT_ID = os.getenv("AICORE_CLIENT_ID") +AICORE_CLIENT_SECRET = os.getenv("AICORE_CLIENT_SECRET") + +AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY") +AWS_BUCKET_ID = os.getenv("AWS_BUCKET_ID") +AWS_REGION = os.getenv("AWS_REGION") +AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") +DEPLOYMENT_URL = os.getenv("DEPLOYMENT_URL") + +# Initializing the GenAIHubProxyClient +client = GenAIHubProxyClient( + base_url=AICORE_BASE_URL, + auth_url=AICORE_AUTH_URL, + client_id=AICORE_CLIENT_ID, + client_secret=AICORE_CLIENT_SECRET, + resource_group=AICORE_RESOURCE_GROUP +) +``` + +**NOTE:** +- Ensure the **requirements.txt** installation completes successfully before running the code. +- If you face any issues, recheck your **.env** values and installed packages. 
+ +[OPTION END] + +[OPTION BEGIN [Bruno]] + +- Download the [Bruno_collections](img/AI_Core.json) file + +- please follow the steps in the [Tutorial](https://developers.sap.com/tutorials/ai-core-orchestration-consumption.html) to set up your environment, refer step - **Set Up Your Environment and Configure Access** and proceed till generating the token + +[OPTION END] + +### Registering an Object Store Secret in AI Core + +[OPTION BEGIN [SAP AI Launchpad]] + +- Open the **SAP AI Core Launchpad** and navigate to the **Administration** tab. +- Select the **Object Store** section from the left-hand menu. +- Click on **“Add”** to register a new object store secret. +- Fill in the required bucket details as shown in the screenshot below. + +![img](img/image_33.png) + +In the **Secret** field, use the following structure to provide your AWS credentials: + +```json +{ + "AWS_ACCESS_KEY_ID": "Enter Your value", + "AWS_SECRET_ACCESS_KEY": "Enter Your value" +} +``` + +[OPTION END] + +[OPTION BEGIN [Python]] + +To make your evaluation files available for AI Core orchestration, you need to: + +- Upload them to an object store (e.g., AWS S3). +- Register the object store secret in AI Core. + +#### **Setup Authentication and Headers** + +First, define the authentication headers for AI Core REST API calls. + +```PYTHON +def _get_headers(): + headers = { + "Authorization": client.get_ai_core_token(), + "AI-Resource-Group": AICORE_RESOURCE_GROUP, + "Content-Type": "application/json", + } + return headers +``` + +#### **Register Object Store Secret in AI Core** + +Register your S3 bucket and credentials as a secret. 
+
+```PYTHON
+# Register S3 secret with AI Core which will be used as an input source
+import json
+import logging
+
+import requests
+
+def register_oss_secret():
+    headers = _get_headers()
+
+    POST_SECRETS_ENDPOINT = '/v2/admin/objectStoreSecrets'
+    request_url = f"{AICORE_BASE_URL}{POST_SECRETS_ENDPOINT}"
+
+    request_body = {
+        "name": "genai-data",
+        "data": {
+            "AWS_ACCESS_KEY_ID": AWS_ACCESS_KEY,
+            "AWS_SECRET_ACCESS_KEY": AWS_SECRET_ACCESS_KEY
+        },
+        "type": "S3",
+        "bucket": AWS_BUCKET_ID,
+        "endpoint": "s3-eu-central-1.amazonaws.com",
+        "region": AWS_REGION,
+        "pathPrefix": ""
+    }
+    try:
+        response = requests.post(
+            request_url, headers=headers, data=json.dumps(request_body), timeout=120
+        )
+        result = response.json()
+        print(result)
+        return result
+    except:
+        logging.error("Error occurred while attempting to create object store secret")
+        raise
+
+register_oss_secret()
+```
+[OPTION END]
+
+[OPTION BEGIN [Bruno]]
+
+Generic secrets securely store AWS S3 credentials required for document access
+
+• Expand **objectStoreSecrets** under admin and select create a secret request
+
+Use the below payload to create a secret for AWS S3 with your access credentials.
+
+```CODE
+{
+  "name": "genai-data",
+  "data": {
+    "AWS_ACCESS_KEY_ID": "",
+    "AWS_SECRET_ACCESS_KEY": ""
+  },
+  "type": "S3",
+  "bucket": "",
+  "endpoint": "",
+  "region": "",
+  "pathPrefix": ""
+  }
+```
+• Ensure that all values in the data dictionary are Base64-encoded as per AWS S3 credential requirements
+
+![img](img/image-br01.png)
+
+[OPTION END]
+
+> ⚠️ **Important Note (Must Read)**
+>
+> - You must **create an object store secret** with a user defined name (for eg: default) to store **output artifacts** from orchestration runs. This is **mandatory**.
+> - For **input artifacts**, you may create additional object store secrets with different names if needed.
+> - If a user defined name (for eg: default) is not configured, orchestration runs will **fail** due to missing output target setup.
+ + +### Upload and Register Dataset + +[OPTION BEGIN [SAP AI Launchpad]] + +After creating the secret, upload your evaluation files to the S3 bucket and register them as an artifact in AI Core. + +#### **Register Uploaded Files as Artifact in AI Core** + +To register your evaluation dataset with SAP AI Core, you need to upload it as an artifact. Follow the instructions below using the **SAP AI Launchpad UI**. + +--- + +- Open the **SAP AI Core Launchpad**. +- Navigate to the **Generative AI/Optimization/Artifacts** section to create dataset artifact. + +![img](img/image_19.png) + +- On the **Artifacts** section, click **add**. + +--- + +- On the **General Information** screen, enter the following: + + - **Select Scenario:** `genai-evaluations` + - **Name:** `genai-eval-test-data` + - **Description:** `Demo artifacts for evaluation flow.` + - **Select Object Store:** `genai-data` + - **Sub-folder path:** `genaiEvaluation/` + + > 💡 Replace `` with your **SAP BTP user ID** or the folder path in your object store where the evaluation files are uploaded. + +- On the **Labels** screen, click **“Add Label”** and provide the following: + + - **Key:** `prompt-evaluation` + - **Value:** `true` + *(Note: The prefix `ext.ai.sap.com/` is automatically pre-filled in the UI.)* + + ![img](img/image_21.png) + +- Review all entered details carefully. +- Click **“Add”** to complete the artifact registration. + +[OPTION END] + +[OPTION BEGIN [Python]] + +After creating the secret, organize your evaluation files into the eval/ folder testdata. Upload them to S3 and register as artifacts in AI Core. + +#### **Upload Files to S3 Bucket** +```python +# Uploads the testdata folder to Object Store for simplified workflow +def upload_folder_to_s3(root_folder, bucket_name, s3_prefix=None): + """ + Look for 'testdata' folder inside root_folder and upload it to S3 under the same s3_prefix. + If no s3_prefix is provided, a static prefix or a UUID will be used. 
+ + The S3 structure will be: + genaiEvaluation/{s3_prefix}/testdata/... + + Args: + root_folder (str): Path containing the 'testdata' subfolder. + bucket_name (str): Name of the S3 bucket. + s3_prefix (str, optional): S3 prefix path. Defaults to None. + + Returns: + str: The path for newly uploaded input artifacts on S3. + + Raises: + FileNotFoundError: If 'testdata' subfolder is missing. + """ + testdata_folder = os.path.join(root_folder, "testdata") + if not os.path.isdir(testdata_folder): + raise FileNotFoundError(f"Missing required folder: testdata in {root_folder}") + + if s3_prefix is None: + # Generate a unique prefix using UUID or static ID + prefix_guid = "" # replace with UUID if needed + s3_prefix = f"genaiEvaluation/{prefix_guid}" + + s3_client = boto3.client( + 's3', + aws_access_key_id=AWS_ACCESS_KEY, + aws_secret_access_key=AWS_SECRET_ACCESS_KEY, + region_name=AWS_REGION + ) + + # Upload only the testdata folder + full_prefix = f"{s3_prefix}/testdata" + for root, _, files in os.walk(testdata_folder): + for file in files: + local_path = os.path.join(root, file) + relative_path = os.path.relpath(local_path, testdata_folder) + s3_key = f"{full_prefix}/{relative_path}".replace("\\", "/") + print(f"Uploading {local_path} to s3://{bucket_name}/{s3_key}") + s3_client.upload_file(local_path, bucket_name, s3_key) + + return f"ai://genai-data/{s3_prefix}" +``` + ![img](img/image_5.png) + +#### **Register Uploaded Files as Artifact in AI Core** + +```python +# Registering the uploaded files from AWS as artifacts to use inside configuration. 
+def register_artifact(input_artifact_path): + headers = _get_headers() + + GET_ARTIFACTS_ENDPOINT = '/v2/lm/artifacts' + request_url = f"{AICORE_BASE_URL}{GET_ARTIFACTS_ENDPOINT}" + + request_body = { + "labels": [ + { + "key": "ext.ai.sap.com/prompt-evaluation", + "value": "true" + } + ], + "name": "genai-eval-test-data", + "kind": "other", + "url": input_artifact_path, + "description": "demo artifacts for evaluation flow.", + "scenarioId": "genai-evaluations" + } + try: + response = requests.post( + request_url, headers=headers, data=json.dumps(request_body), timeout=120 + ) + result = response.json() + print(result) + return result['id'] + except: + print("Error occurred while attempting to create an execution") + raise +``` +![img](img/image_6.png) + +[OPTION END] + +[OPTION BEGIN [Bruno]] + +Before registering a dataset artifact in Bruno, you must upload your CSV file to the SAP AI Core object store using the Dataset API. +Bruno cannot upload files directly to S3; therefore, this step is required. + +**Prerequisites** + + - An object store secret must already exist in your resource group.Typically, this is the default secret named **default**. + + - The Dataset API currently supports: + + - S3 object stores only + + - CSV file uploads + +**Upload Your Dataset** + +Use the Dataset API – Upload File request in Bruno: + +```bash +PUT:{{ai_api_url}}/v2/lm/dataset/files/{{secretName}}/{{datasetPath}} +``` + +**Headers** + +```json +Authorization: Bearer {{token}} +AI-Resource-Group: {{resourceGroup}} +Content-Type: text/csv +``` + +**Body** + +Upload your .csv file directly as binary in Bruno’s Body + +Example Path Values: + + - secretName: default + + - datasetPath: testdata/medicalqna_dataset.csv + +![img](img/image_br_dt.png) + +**Note:** + +Save the ai://… URL — you will use this when creating the dataset artifact. 
+ +**Register the Dataset Artifact** + +- Click on **Register artifact** under lm -> artifacts in bruno collection to register the artifact + +```CODE +{ + "name": "aiconfig", + "kind": "dataset", + "url": "ai://default/testdata/medicalqna_dataset.csv", + "scenarioId": "genai-evaluations" +} +``` +![img](img/image-br02.png) + +[OPTION END] + +### Approach Selection – Choose How You Want to Provide Prompts(Read-up) + +In this evaluation workflow, you can provide prompts in two different ways. +Choose only one option based on your requirement. + +Here are your two options: + +| Option | Approach | Description | When to Use | +| ------------ | ------------------------------------------- | --------------------------------------------------------------- | ----------------------------------------------------- | +| **Option 1** | Prompt Template + Model Directly | Prompt stored in Prompt Registry and model referenced directly. | When you want reusable, versioned prompts. | +| **Option 2** | Orchestration Registry (Inline Prompt) | Prompt provided as part of orchestration config. | When prompt is ad-hoc or not reused. | + +After selecting your option: + + - Follow only the steps for that option. + + - Skip the other options. + + - After completing your selected option, go directly to Create Evaluation Configuration. + +### (Option 1) - Providing Prompts via Prompt Template + Model Directly + +✔ Follow this step **ONLY IF** you want to use **Prompt Template**. + +If not, **skip this step and go to Option 2**. + +[OPTION BEGIN [SAP AI Launchpad]] + +A Prompt Template defines: + + - The message roles (system, user, etc.) + + - Variables that get substituted from your dataset (e.g., questions) + + - Optional model configuration (temperature, max tokens, etc.) + +We’ll create a prompt template to guide the model to answer the questions + +**create the Prompt Template** + +- In SAP AI Launchpad, go to the left-hand menu and select Generative AI Hub → Prompt Management. 
+ +- click on Templates → create + +- This is where you can define reusable templates with variables for evaluations. + +![img](img/image_007.png) + +**Define the Prompt** + +In the Message Blocks section: + +- Add a System role message: +```json +{ + "template": [ + { + "role": "user", + "content": "List the benefits and side effects of the drug in the following consumer health question: {{?question}}." + } + ] +} +``` + +**Configure Variables** + +Scroll down to Variable Definitions and add entries for each variable: + +- question + + - Default Value: leave empty or set to en for fallback + +This ensures the placeholders are dynamically substituted during evaluation. + +![img](img/image_008.png) + +**Save the Template** + +Click Save Template (top right): + +- Scenario → genai-evaluations + +- Name → prompt-registry-eval-acc-test + +- Version → 1.0.0 + +Click Save to persist the template. + +**Verify the Template** + +Go to Generative AI Hub → Prompt Management → Templates and confirm: + +- The template appears with the correct name, scenario, and version. + +- Managed By → shows how the template is stored. + +- Versioning is tracked automatically + +![img](img/image_10.png) + +[OPTION END] + +[OPTION BEGIN [Python]] + +```python +import os +import json + +def get_prompt_config_file(folder_path): + """ + Retrieves a list of all JSON file names in the specified folder. + """ + if not os.path.isdir(folder_path): + print(f"The folder path '{folder_path}' does not exist.") + return [] + + json_files = [file for file in os.listdir(folder_path) if file.endswith(".json")] + + if not json_files: + print(f"No JSON files were found in the folder '{folder_path}'.") + return json_files + + +def get_dataset_file_name(folder_path): + """ + Retrieves the name of the first file in the specified folder. 
+ """ + if not os.path.isdir(folder_path): + print(f"The folder path '{folder_path}' does not exist.") + return None + + items_in_folder = os.listdir(folder_path) + + for item in items_in_folder: + item_path = os.path.join(folder_path, item) + if os.path.isfile(item_path): + return item + + print(f"No files were found in the folder '{folder_path}'.") + return None + + +def load_prompt_template(folder_path, file_name): + """ + Loads the contents of a JSON prompt template into a variable. + """ + file_path = os.path.join(folder_path, file_name) + try: + with open(file_path, "r", encoding="utf-8") as f: + return json.load(f) # returns Python dict + except Exception as e: + print(f"Error loading prompt template: {e}") + return None + +# --- MAIN EXECUTION --- +PROMPT_FOLDER = "./PUT_YOUR_PROMPT_TEMPLATE_HERE" +DATASET_FOLDER = "./PUT_YOUR_DATASET_HERE" + +PROMPT_CONFIG_FILES = get_prompt_config_file(PROMPT_FOLDER) +DATASET_NAME = get_dataset_file_name(DATASET_FOLDER) + +if PROMPT_CONFIG_FILES and DATASET_NAME: + # Load the first JSON prompt template + PROMPT_TEMPLATE = load_prompt_template(PROMPT_FOLDER, PROMPT_CONFIG_FILES[0]) + print(f"Prompt configs: {PROMPT_CONFIG_FILES}") + print(f"Dataset name: {DATASET_NAME}") + print("Prompt template contents:", PROMPT_TEMPLATE) +else: + print("Missing run or dataset file.") + raise SystemExit("Exiting due to missing run/dataset file.") +``` + +```python +def create_prompt_template(): + headers = _get_headers() + GET_PROMPT_TEMPLATES_ENDPOINT = '/v2/lm/promptTemplates' + request_url = f"{AICORE_BASE_URL}{GET_PROMPT_TEMPLATES_ENDPOINT}" + + request_body = { + "name": "prompt-registry-eval-acc-test", + "version": "1.0.0", + "scenario": "genai-evaluations", + "spec": PROMPT_TEMPLATE + } + try: + response = requests.post( + request_url, headers=headers, data=json.dumps(request_body), timeout=120 + ) + if(response.status_code != 200): + raise + result = response.json() + print(result) + return result['id'] + except: + 
logging.error("Error occurred while attempting to create a prompt template") + raise + +prompt_template_id = create_prompt_template() +``` +**Note** + +If you wish to use a prompt template that already exists in prompt registry, you can manually set prompt_template_id in the next cell and skip executing this cell + +If you already have an existing template set the ID manually: + +```python +prompt_template_id = "" +``` + +[OPTION END] + +[OPTION BEGIN [Bruno]] + +In Bruno, you can create a prompt template by sending a POST request to the AI Core API: + +**Request: Create Prompt Template** + +**URL:** + +```bash +POST {{api_url}}/v2/lm/promptTemplates +``` + +**Headers:** +``` +Authorization: Bearer {{access_token}} +Content-Type: application/json +``` + +**Body (JSON):** +```json +{ + "name": "prompt-registry-eval-acc-test", + "version": "1.0.0", + "scenario": "genai-evaluations", + "spec": { + "template": [ + { + "role": "user", + "content": "List the benefits and side effects of the drug in the following consumer health question: {{?question}}." + } + ], + "defaults": {}, + "additional_fields": { + "modelParams": { + "temperature": 0.3, + "max_tokens": 100 + }, + "modelGroup": "chat" + } + } +} +``` +![img](img/image_br_pr.png) + +[OPTION END] + +🔑 Tip: Always increment the version (e.g., 1.0.1, 1.0.2) when updating a template. This ensures reproducibility across evaluations. + +### (Option 2) - Providing Prompts via Orchestration Registry (Inline Prompt) + +Follow this step only if you want to **store prompt + model configuration inside Orchestration Registry**. + +**Create Orchestration Registry Configuration** + +[OPTION BEGIN [SAP AI Launchpad]] + +Go to Generative AI Hub → Orchestration → Orchestration Configurations + +- click create + +- In templating add the system prompt + +```json +List the benefits and side effects of the drug in the following consumer health question: {{?question}}. 
+``` +![img](img/image_ail_or1.png) + +- select the model in model configuration and save + +![img](img/image_ail_or2.png) + +![img](img/image_ail_or3.png) + +[OPTION END] + +[OPTION BEGIN [Python]] + +```python +def create_orchestration_registry_config(): + headers = _get_headers() + CREATE_ORCHESTRATION_REGISTRY = '/v2/registry/v2/orchestrationConfigs' + request_url = f"{AICORE_BASE_URL}{CREATE_ORCHESTRATION_REGISTRY}" + model_name,model_version=selected_models_str.split(":") + request_body = { + "name": "genai-eval-test", + "version": "1.0.0", + "scenario": "genai-evaluations", + "spec": { + "modules": { + "prompt_templating": { + "model": { + "name": model_name, + "version": model_version + }, + "prompt": PROMPT_TEMPLATE + } + } + } + } + try: + response = requests.post( + request_url, headers=headers, data=json.dumps(request_body), timeout=120 + ) + if(response.status_code != 200): + print(response.json()) + raise + result = response.json() + print(result) + return result['id'] + except: + logging.error("Error occurred while attempting to create a orchestration registry id") + raise +orchestration_registry_id = create_orchestration_registry_config() +``` + +![img](img/image_py_or1.png) + +[OPTION END] + +[OPTION BEGIN [Bruno]] + +You can paste this directly into a Bruno .bru file or create a new request inside Bruno. + +**Url:** +```bash +POST {{AICORE_BASE_URL}}/v2/registry/v2/orchestrationConfigs +``` + +**headers:** +``` +{ + Authorization: Bearer {{token}} + AI-Resource-Group: {{resource_group}} + Content-Type: application/json + } +``` + +**body:** +```json +{ + "name": "genai-eval-test", + "version": "1.0.0", + "scenario": "genai-evaluations", + "spec": { + "modules": { + "prompt_templating": { + "model": { + "name": "model_name", + "version": "model_version" + }, + "prompt": { + "template": [ + { + "role": "user", + "content": "List the benefits and side effects of the drug in the following consumer health question: {{?question}}." 
+ } + ], + "defaults": {} + } + } + } + } +} +``` + +![img](img/image_br_or1.png) + +[OPTION END] + +After completing Option 2: + + - Proceed directly to the “Create Evaluation Configuration” section + + +### Understanding Metrics (Pre-Read) + +Metrics determine how your model outputs are evaluated during an evaluation run. They define the scoring logic that SAP AI Core uses to compare models, measure quality, and validate improvements over time. + +In SAP AI Core, you can use: + + - System-defined metrics (ready-made, no setup needed) + + - Custom metrics (your own definitions stored in the metric registry) + +**How Metrics Apply in Each Approach** + +| Approach | How Metrics Apply | +| ------------------------------------- | ----------------------------------------------------------------------------- | +| **Option 1 – Prompt Template** | Metrics score responses generated using the prompt template + selected model. | +| **Option 2 – Orchestration Registry** | Metrics score responses generated through orchestration configuration. | + +Metrics are provided later during **Create Evaluation Configuration**: + +```json +"metrics": "BERT, answer_relevance" +``` + +You can specify one or multiple metrics (comma-separated). + +#### Types of Metrics + +**1. System-defined Metrics** + +These come in two categories: + +**Computed Metrics** + +Score outputs using reference data or validation logic. + +| Metric | Description | Needs Reference? | +| --------------------- | ------------------------------------------ | ---------------- | +| **BERT Score** | Embedding similarity to reference | Yes | +| **BLEU** | N-gram overlap | Yes | +| **ROUGE** | Recall-based overlap | Yes | +| **Exact Match** | Checks if output exactly matches reference | Yes | +| **JSON Schema Match** | Validates output against a schema | Yes | +| **Language Match** | Detects language | No | +| **Content Filter** | Safety filter triggered (input/output) | No | + +**2. 
LLM-as-a-Judge Metrics** + +These metrics use a judge LLM to score responses based on a rubric. +They are ideal for open-ended tasks with no exact references. + +| Metric | What It Measures | Needs Reference? | +| ------------------------- | --------------------------------- | ---------------- | +| **Instruction Following** | How well the prompt was followed | No | +| **Correctness** | Factual accuracy | Yes | +| **Answer Relevance** | Relevance of the generated answer | No | +| **Conciseness** | Brevity + clarity | No | +| **RAG Groundedness** | Grounding in the provided context | No | +| **RAG Context Relevance** | Usefulness of retrieved context | No | + +--- + +#### Custom Metrics + +Create them when system metrics are insufficient. + +Two ways to define custom metrics: + +**1. Structured metrics (recommended)** + + - Provide task, criteria, rubric, optional examples + + - AI Core constructs the judge prompt + +**2. Free-form metrics** + + - You define prompts and scoring logic manually + +**Custom metric registration:** + +```bash +POST {{ai_api_url}}/v2/lm/evaluationMetrics +``` +Once registered, use them like system metrics: + +```json +"metrics": "my_custom_metric" +``` + +**Example — Prompt Template Approach** + +```json +"metrics": "BERT Score,answer_relevance" +``` + +**Example — Orchestration Registry Approach** + +```json +"metrics": "Pointwise Conciseness" +``` + +The chosen metrics determine: + + - scoring + + - dashboard visualizations + + - aggregated results + + - model ranking logic + +### Providing Metrics for Evaluation + +Metrics must be supplied before creating an Evaluation Configuration. + +[OPTION BEGIN [SAP AI Launchpad]] + +In SAP AI Launchpad, metrics are selected visually during the Evaluation Configuration creation flow. 
+ +You can choose: + + - System-defined metrics + + - Custom metrics (your own definitions stored in the metric registry — cannot be created directly in AI Launchpad; to use them, register them via API/Bruno mentioned in the same step and then select them in the Evaluation Configuration) + +No manual JSON input is needed—the UI provides a selectable list of available metrics. + +1. Go to Generative AI Hub → Optimization. + +2. Click Create to start a new evaluation configuration. + +![img](img/image_25.png) + +Select Test Input / Runs depending on the option you used earlier: + +| Earlier Option Used | What to Select in AIL | +| ------------------------------------- | ----------------------------------------------------------------- | +| **Option 1 – Prompt Template** | Select your **Prompt Template** and choose one or more **Models** | +| **Option 2 – Orchestration Registry** | Select your **Orchestration Registry Config ID** | + +Then: + + - Select your registered dataset artifact + + - Enter the dataset path (example): + testdata/global_customer_queries.csv + + - Set the number of test samples (e.g., 20) + + ![img](img/image_26.png) + +- Click **Next** to go to Metrics selection. + +#### Select Evaluation Metrics + +Choose the metrics you want to evaluate. 
+ +You may choose one or multiple system-defined or custom metrics—examples: + + - BERT Score + + - answer_relevance + + - instruction_following + +![img](img/image_27.png) + +--- + +> 📘 **Helpful Resources**: +> +> - [System-Defined Evaluation Metrics – SAP Documentation](https://help.sap.com/docs/sap-ai-core/generative-ai-hub/system-defined-evaluation-metrics) +> - [Define Your Own Custom Metrics – SAP Guide](https://help.sap.com/docs/sap-ai-core/generative-ai-hub/custom-metrics) +> *(If your evaluation requires domain-specific or advanced scoring logic)* + +> **Note: You may select additional metrics based on your use case.** + +--- + +[OPTION END] + +[OPTION BEGIN [Python]] + +**Metrics Handling in Python Notebook (Automatic Detection & Creation)** + +When running the evaluation through the Python notebook, metric setup is partially automated. +Before the evaluation configuration is created, the script performs the following: + + - Users can manually specify metric IDs + + - Or place custom metric JSON files in CUSTOM_METRIC_FOLDER + + - The notebook loads all custom metric definitions automatically + + - It checks if each metric already exists in AI Core + + - If not found → creates it automatically + + - Prints final list of metric IDs used for evaluation + +This ensures all metrics exist before the evaluation configuration is created. + +```python +import os +import json +import requests + +# --- Load JSON / JSONL files --- +def load_all_metrics(folder_path): + """ + Loads all JSON and JSONL files from a folder into a single list of dicts. 
+ """ + metrics = [] + files = [f for f in os.listdir(folder_path) if f.endswith((".json", ".jsonl"))] + + if not files: + print(f"No JSON/JSONL files found in {folder_path}") + return metrics + + for file_name in files: + file_path = os.path.join(folder_path, file_name) + try: + with open(file_path, "r", encoding="utf-8") as f: + content = f.read().strip() + try: + data = json.loads(content) + if isinstance(data, list): + metrics.extend(data) + elif isinstance(data, dict): + metrics.append(data) + except json.JSONDecodeError: + # Attempt to parse as JSONL line by line + for line in content.splitlines(): + line = line.strip() + if not line: + continue + try: + metrics.append(json.loads(line)) + except json.JSONDecodeError: + print(f"Skipping invalid JSON line in {file_name}: {line[:50]}...") + except Exception as e: + print(f"Error reading {file_name}: {e}") + return metrics + +# --- Fetch all metrics from SAP AI Core --- +def fetch_all_metrics(): + request_url = f"{AICORE_BASE_URL}/v2/lm/evaluationMetrics" + resp = requests.get(request_url, headers=_get_headers()) + resp.raise_for_status() + return resp.json().get("resources", []) + +# --- Create or fetch a metric --- +def create_or_get_metric(custom_metric, user_metric_id=None): + all_metrics = fetch_all_metrics() + + # 1️⃣ User-supplied ID lookup + if user_metric_id: + for m in all_metrics: + if m.get("id") == user_metric_id: + print(f"✅ Metric already exists by ID: {user_metric_id}") + return user_metric_id + print(f"⚠️ User metric ID {user_metric_id} not found, will only include if valid later") + + # 2️⃣ Check by scenario, name, version + scenario = custom_metric.get("scenario") + name = custom_metric.get("name") + version = custom_metric.get("version") + if not all([scenario, name, version]): + raise ValueError("Metric must include 'scenario', 'name', and 'version'") + + for m in all_metrics: + if (m.get("scenario") == scenario and + m.get("name") == name and + m.get("version") == version): + metric_id = 
m.get("id") + print(f"✅ Metric already exists: {scenario}/{name} v{version}, ID = {metric_id}") + return metric_id + + # 3️⃣ Create metric if not found + request_url = f"{AICORE_BASE_URL}/v2/lm/evaluationMetrics" + required_fields = ["scenario", "name", "version", "evaluationMethod", "metricType"] + for f in required_fields: + if f not in custom_metric: + raise ValueError(f"❌ Missing required field: {f}") + + resp = requests.post(request_url, headers=_get_headers(), json=custom_metric) + resp.raise_for_status() + metric_id = resp.json().get("id") + print(f"✅ Metric created successfully: {name} v{version}, ID = {metric_id}") + return metric_id + +# --- Main pipeline --- +CUSTOM_METRIC_FOLDER = "./PUT_YOUR_CUSTOM_METRIC_HERE" +user_metric_ids = "" # set by user if needed + +# 1️⃣ Load all metrics from JSON/JSONL +custom_metric_list = load_all_metrics(CUSTOM_METRIC_FOLDER) + +# 2️⃣ Create/fetch metrics from SAP AI Core +metric_ids = [] +for metric in custom_metric_list: + try: + metric_id = create_or_get_metric(metric) + metric_ids.append(metric_id) + except ValueError as e: + print(f"Skipping metric due to error: {e}") + +# 3️⃣ Validate user_metric_ids separately if provided +if user_metric_ids and user_metric_ids.strip(): + all_metrics = fetch_all_metrics() + # Split comma-separated IDs and strip whitespace + for uid in [uid.strip() for uid in user_metric_ids.split(",")]: + if any(m.get("id") == uid for m in all_metrics): + metric_ids.append(uid) + else: + print(f"⚠️ User metric ID {uid} does not exist in AI Core, skipping.") +# 4️⃣ Convert to comma-separated string +custom_metric_ids_str = ",".join(metric_ids) +print("✅ All processed metric IDs:", custom_metric_ids_str) +``` +![img](img/image_py03.png) + +This ensures all required metrics are available before launching the evaluation. 
+ +[OPTION END] + +[OPTION BEGIN [Bruno]] + +Bruno supports two ways of providing metrics: + +**Use System-Defined Metrics** + +You can directly pass system metrics in your configuration: + +Example: + +```json +"metrics": "answer_relevance" +``` + +If you want to register custom metrics, you must call: + +➡️ **Create Custom Metric** + +```bash +POST {{ai_api_url}}/v2/lm/evaluationMetrics +``` +**Body example:** + +```json +{ + "scenario": "genai-evaluations", + "name": "groundedness", + "version": "0.0.1", + "evaluationMethod": "llm-as-a-judge", + "metricType": "evaluation", + "promptType": "structured", + "spec": { + "configuration": { + "modelConfiguration": { + "name": "gpt-4o", + "version": "2024-08-06", + "parameters": [ + { + "key": "temperature", + "value": "0.1" + }, + { + "key": "max_tokens", + "value": "110" + } + ] + }, + "promptConfiguration": { + "evaluationTask": "You will be assessing groundedness, which measures how well the AI-generated response aligns with and is supported by the provided reference.", + "criteria": "Groundedness: The degree of factual and contextual overlap between the response and the reference.", + "ratingRubric": [ + { + "rating": 5, + "rule": "Fully grounded — the response completely aligns with and is fully supported by the reference." + }, + { + "rating": 4, + "rule": "Mostly grounded — the response largely aligns with the reference with only minor deviations." + }, + { + "rating": 3, + "rule": "Somewhat grounded — the response partially aligns, but some details are missing or loosely connected." + }, + { + "rating": 2, + "rule": "Poorly grounded — the response contains minimal overlap with the reference." + }, + { + "rating": 1, + "rule": "Not grounded — the response has no meaningful overlap with the reference." + } + ], + "includeProperties": ["reference","response"] + } + } + } +} + +``` + +You will receive: + +```json +"id": "" +``` + +This metric ID can be directly passed into the evaluation configuration. 
+ +[OPTION END] + +### Define and Create Evaluation Configurations + +[OPTION BEGIN [SAP AI Launchpad]] + +Once your dataset artifact is registered and you have completed Option 1 (Prompt Template) or Option 2 (Orchestration Registry), the next step is to create an Evaluation Configuration. + +An Evaluation Configuration tells SAP AI Core: + + - which dataset to evaluate + + - which prompt/model or orchestration config to use + + - which metrics to compute + + - which orchestration deployment endpoint to call + + - how many repetitions to run + + - which test dataset file to load + +This configuration becomes the blueprint for your evaluation execution. + +**Steps to Create Evaluation Configuration** + +In Additional Configuration + +- Set **Number of Repetitions** to `1`. +- Choose an existing deployment for **Orchestration Endpoint**. + + ![img](img/image_29.png) +--- + +#### Final Review & Start + +- Review all the details on the summary page. +- Once confirmed, click **Create** to start the evaluation job. + +![img](img/image_40.png) + +> ✅ You have now successfully configured and triggered a Generative AI Evaluation. + +[OPTION END] + +[OPTION BEGIN [Python]] + +When using the Python notebook, the evaluation configuration is created automatically based on your selections. +Before creating the configuration, the notebook will: + + - Load the dataset artifact ID + + - Resolve metric IDs (system + custom) + + - Load prompt template or orchestration registry IDs + + - Validate all required parameters + +**Choose Configuration Mode (Option 1 or Option 2)** + +The notebook provides a simple UI with two checkboxes: + +**Option 1 – Prompt Template + Models** + +**Option 2 – Orchestration Registry** + +You must select only one. 
+ +The notebook ensures mutual exclusivity and stores your selection in the variable: + +```python +from ipywidgets import Checkbox, VBox, HBox, Output, Label, Layout +from IPython.display import display +import textwrap + +# --- Selection state --- +approach = None +suppress_update = False + +# --- Define options --- +flag_options = [ + "prompt_registry", + "orchestration_registry" +] + +# --- Output widget to show current selection --- + +output = Output(layout=Layout(border="1px solid black", height="70px", overflow="auto", width="900px")) + + +# --- Handler for checkbox changes --- +def on_flag_change(change): + global approach, suppress_update + if suppress_update: + return + + if change["new"]: # A checkbox was checked + suppress_update = True + # Uncheck all other checkboxes + for cb in checkboxes: + if cb.description != change["owner"].description: + cb.value = False + suppress_update = False + approach = change["owner"].description + else: + # Only clear if the unchecked one was the currently selected + if approach == change["owner"].description: + approach = None + + # Update display once per action + with output: + output.clear_output(wait=True) + msg = f"Selected approach: {approach or 'None'}" + wrapped = textwrap.fill(msg, width=60) + output.append_stdout(wrapped + "\n") + +# --- Create checkboxes --- +checkboxes = [ + Checkbox(value=False, description=option, layout=Layout(width="250px")) + for option in flag_options +] + +# --- Attach event handler --- +for cb in checkboxes: + cb.observe(on_flag_change, names="value") + +# --- Display UI --- +header = Label( + value="Please select the configuration mode:", + layout=Layout(margin="10px 0px 10px 0px") +) +ui = VBox([header, HBox(checkboxes), output]) +display(ui) +``` + +This value determines which fields are passed later: + + - If approach == "prompt_registry" → notebook passes promptTemplate + models + + - If approach == "orchestration_registry" → notebook passes orchestrationRegistryIds + +#### 
Create Configuration Body + +The notebook builds the configuration using the required SAP AI Core fields: + + - scenarioId + + - executableId + + - dataset artifact binding + + - selected metrics + + - test dataset details + + - repetitions + + - orchestration deployment URL + + - and Option 1 or Option 2 fields, depending on the chosen approach. + +The following function dynamically creates the configuration body for AI Core. + +```python +# creating an AICORE Configuration. +import requests + +request_body = { + "name": "genai-eval-conf", + "scenarioId": "genai-evaluations", + "executableId": "genai-evaluations-simplified", + "inputArtifactBindings": [ + { + "key": "datasetFolder", + "artifactId": artifact_id + } + ], + "parameterBindings": [ + { + "key": "repetitions", + "value": repetitions + }, + { + "key": "orchestrationDeploymentURL", + "value": orchestration_deployment_url + }, + { + "key": "metrics", + "value": metrics_list + }, + { + "key": "testDataset", + "value": test_datasets + }, + { + "key": "promptTemplate", + "value": prompt_template_id if approach == "prompt_registry" else "" + }, + { + "key": "models", + "value": models_list if approach == "prompt_registry" else "" + }, + { + "key": "orchestrationRegistryIds", + "value": orchestration_registry_id if approach == "orchestration_registry" else "" + } + ] +} + +def create_aicore_configuration(): + headers = _get_headers() + GET_CONFIGURATIONS_ENDPOINT = '/v2/lm/configurations' + request_url = f"{AICORE_BASE_URL}{GET_CONFIGURATIONS_ENDPOINT}" + try: + response = requests.post( + request_url, headers=headers, data=json.dumps(request_body), timeout=120 + ) + print(response) + if(response.status_code != 201): + raise + result = response.json() + print(result) + return result['id'] + except: + logging.error("Error occurred while attempting to create a Configuration") + raise + +configuration_id = create_aicore_configuration() +``` + +**Sample parameter setup:** + +```python +import json +test_data_path = 
f"testdata/{DATASET_NAME}" # specify the test data path here. For the full folder just specifying testdata will work +test_datasets = json.dumps({'path': test_data_path, 'type': 'csv'}) +metrics_list = ",".join([selected_metrics_str,custom_metric_ids_str]) +models_list = selected_models_str +print(f"Selected metrics: {metrics_list}") +print(f"Selected models: {models_list}") +orchestration_deployment_url = "" +repetitions = "1" +``` + +You will receive a configuration ID, which is required for the next step (Execution). + +![img](img/image_py_con.png) + +SAP AI Core returns a configuration ID, which is used to trigger the evaluation execution. + +[OPTION END] + +[OPTION BEGIN [Bruno]] + +When creating an Evaluation Configuration through Bruno, you call: + +```bash +POST {{api_url}}/v2/lm/configurations +``` +Instead, you choose between: + +**Option 1 — Prompt Template + Models** + +**Option 2 — Orchestration Registry** + +based on which fields you include in your request body. + +| Option Selected | Fields You Must Pass | +| ------------------------------------- | -------------------------- | +| **Option 1 – Prompt Template** | `promptTemplate`, `models` | +| **Option 2 – Orchestration Registry** | `orchestrationRegistryIds` | + +All other fields (metrics, testDataset, repetitions, orchestrationDeploymentURL) remain the same across both options. + +Below are the sample request bodies for each option. 
+ +#### Option 1 — Using Prompt Template + Models + +```json +{ + "name": "genai-eval-conf", + "scenarioId": "genai-evaluations", + "executableId": "genai-evaluations-simplified", + "inputArtifactBindings": [ + { + "key": "datasetFolder", + "artifactId": "{{artifactId}}" + } + ], + "parameterBindings": [ + { + "key": "repetitions", + "value": "1" + }, + { + "key": "orchestrationDeploymentURL", + "value": "{{deployment_url}}" + }, + { + "key": "metrics", + "value": "language_match" + }, + { + "key": "testDataset", + "value": "{\"path\": \"testdata/{{dataset_file}}\", \"type\": \"csv\"}" + }, + { + "key": "promptTemplate", + "value": "{{prompt_template_id}}" + }, + { + "key": "models", + "value": "{{model_name}}:{{model_version}}" + } + ] +} +``` +![img](img/image-br03.png) + +#### Option 2 — Using Orchestration Registry + +```json +{ + "name": "genai-eval-conf", + "scenarioId": "genai-evaluations", + "executableId": "genai-evaluations-simplified", + "inputArtifactBindings": [ + { + "key": "datasetFolder", + "artifactId": "{{artifactId}}" + } + ], + "parameterBindings": [ + { + "key": "repetitions", + "value": "1" + }, + { + "key": "orchestrationDeploymentURL", + "value": "{{deployment_url}}" + }, + { + "key": "metrics", + "value": "language_match" + }, + { + "key": "testDataset", + "value": "{\"path\": \"testdata/{{dataset_file}}\", \"type\": \"csv\"}" + }, + { + "key": "orchestrationRegistryIds", + "value": "{{orchestration_registry_id}}" + } + ] +} +``` + +![img](img/image-br06.png) + +[OPTION END] + +### Create and Run Evaluation Execution + +After creating the Evaluation Configuration, the next step is to execute it. 
+Execution triggers the evaluation workflow, which: + + - Reads the test dataset + + - Generates submissions to the orchestration service + + - Collects model outputs + + - Computes all selected metrics + + - Produces aggregate and raw evaluation results + +The process is identical for SAP AI Launchpad, Python, and Bruno, with only the invocation method differing. + +[OPTION BEGIN [SAP AI Launchpad]] + +- Once the evaluation configuration is created, the system automatically triggers an evaluation execution. + +- Follow these steps to monitor its progress and verify completion: + + - Navigate to **ML Operations** in the SAP AI Core Launchpad. + + - In the sidebar, click **Executions**. + + ![img](img/image_41.png) + + - Locate the most recent execution triggered by your evaluation configuration. You can use the timestamp or configuration name to identify it. + + - Click on the execution entry to open its details. The Current Status will update as the process runs. + + ![img](img/image_31.png) + +- Once the Target Status reaches **COMPLETED** , your evaluation has successfully finished. + +> [For More information](https://help.sap.com/docs/sap-ai-core/generative-ai-hub/create-evaluation) + +Track Execution Status + +The execution page will show: + + - Unknown + + - Pending + + - Running + + - Completed + +Once completed, you can navigate to: + + - Outputs → Tracking Metrics (aggregate results) + + - Output Artifacts (raw results stored in the SQLite DB) + +[OPTION END] + +[OPTION BEGIN [Python]] + +Once the configuration is ready, the next step is to trigger an execution. +An execution is a single evaluation run based on the configuration you defined. 
+ +**Create Execution** + +The following function starts the evaluation in SAP AI Core using the configuration ID: + +```python +# Trigger an execution with the created configuration + +import requests +def create_execution(): + headers = _get_headers() + GET_EXECUTIONS_ENDPOINT = '/v2/lm/executions' + request_url = f"{AICORE_BASE_URL}{GET_EXECUTIONS_ENDPOINT}" + request_body = {"configurationId" : configuration_id} + try: + response = requests.post( + request_url, headers=headers, data=json.dumps(request_body), timeout=120 + ) + print("response received is ", response) + result = response.json() + print(result) + return result['id'] + except: + logging.error("Error occurred while attempting to create an execution") + raise + + +execution_id = create_execution() +``` +![img](img/image_44.png) + +#### Monitor Execution Status + +The execution progresses through states: + +UNKNOWN → PENDING → RUNNING → COMPLETED + +```python +# get execution status +import requests +def get_execution_status(execution_id): + headers = _get_headers() + LOG_EXECUTIONS_ENDPOINT = f'/v2/lm/executions/{execution_id}' + request_url = f"{AICORE_BASE_URL}{LOG_EXECUTIONS_ENDPOINT}" + try: + response = requests.get( + request_url, headers=headers, timeout=120 + ) + print("response received is ", response) + result = response.json() + return result + except: + logging.error("Error occurred while attempting to get execution status") + raise + +get_execution_status(execution_id) +``` + +#### Automatic Polling + +To continuously monitor until the evaluation finishes: + +```python +# Polling the execution status until it is COMPLETED or DEAD or timeout occurs +def poll_execution_status(execution_id, timeout_minutes=1800, poll_interval=30): + start_time = time.time() + while True: + result = get_execution_status(execution_id) + print(f"Execution Status: {result.get('status')}") + if result.get("status") == "COMPLETED": + print(f"Execution completed successfully in {time.time() - start_time} seconds, 
proceed to fetch results.") + break + if result.get("status") == "DEAD": + print(f"Execution failed with status DEAD in {time.time() - start_time} seconds. Check the logs for more details.") + break + if time.time() - start_time > timeout_minutes * 60: + raise TimeoutError(f"Execution status polling timed out after {timeout_minutes} minutes.") + time.sleep(poll_interval) + +``` + +![img](img/image_45.png) + +✅ Once the execution status shows COMPLETED, the evaluation results are available and can be analyzed in the next step. + +[OPTION END] + +[OPTION BEGIN [Bruno]] + +After creating the configuration, the next step is to trigger the evaluation workload by creating an AI Core execution. + +**Create an Execution with the Created Configuration** + +- Click on **Create Execution** under Executions and pass the configuration ID created in the previous step + +![img](img/image-br04.png) + +- The status field progresses through different states over time: +UNKNOWN → PENDING → RUNNING → COMPLETED. + +**Get Execution Status** + +Check the status of the created execution by passing the execution ID. The current status will update as the process runs. Please refer to the image below. + +![img](img/image-br05.png) + +[OPTION END] + +### View and Analyze Evaluation Results + +Once the evaluation execution is complete, SAP AI Core generates both aggregated metrics and detailed instance-level results. +These results help compare model performance, understand quality metrics, and debug issues. + +[OPTION BEGIN [SAP AI Launchpad]] + +Once the evaluation workflow execution is completed, this step retrieves the aggregated evaluation metrics from the SAP AI Core service by specifying the run name. + +1. Go to Evaluations → Executions + +2. Select your execution + +3. Open the Metrics tab to view: + + - average latency + + - token usage + + - metric scores + +4. 
Open the Artifacts tab to download: + + - the complete result folder + + - the SQLite DB for deeper analysis + +This is the easiest way to visually inspect evaluation outcomes and compare multiple model runs. + +![img](img/image_46.png) + +[OPTION END] + +[OPTION BEGIN [Python]] + +The notebook includes utility scripts to retrieve aggregated metrics, download detailed artifacts, and inspect SQLite results. This returns all metric values per evaluated run, which your notebook then: + + - Converts into a DataFrame + + - Creates a pivot table + + - Prepares for ranking and scoring + +**Retrieve Aggregate Metrics (Tracking API)** + +Aggregated metrics summarize performance across all test samples. +To fetch them using execution ID: + +```python +# Get aggregate metrics using execution id +import pandas as pd +from IPython.display import HTML + +def get_model_from_run(run): + for tag in run.get("tags", []): + if tag.get("name") == "evaluation.ai.sap.com/model": + return tag.get("value") + +def aggregate_metrics_by_model(runs_list): + transformed_data = [] + for run in runs_list: + model = get_model_from_run(run) + for metric in run["metrics"]: + output_json = { + "model": model, + "metrics_name": metric.get("name"), + "metric_value": metric.get("value") + } + transformed_data.append(output_json) + return transformed_data + + +def create_metrics_pivot_table(transformed_data): + """ + Creates a pivot table where rows are models and columns are metrics. 
+ + Args: + transformed_data: List of dictionaries with 'model', 'metrics_name', 'metric_value' + + Returns: + DataFrame with models as rows and metrics as columns + """ + # Convert list of dictionaries to DataFrame + df = pd.DataFrame(transformed_data) + + # Create pivot table + pivot_table = df.pivot_table( + index='model', + columns='metrics_name', + values='metric_value', + aggfunc='first' # Use 'first' to get the single value, or 'mean' if there are duplicates + ) + + return pivot_table + +transformed_data = aggregate_metrics_by_model(runs_data['resources']) +metrics_pivot = create_metrics_pivot_table(transformed_data) + +HTML(metrics_pivot.to_html()) +``` +![img](img/image_47.png) + +You can also retrieve using run name: + +```bash +{base_url}/v2/lm/metrics?tagFilters=evaluation.ai.sap.com/run-name={run_name} +``` + +**Download Raw Results (Output Artifact)** + +All detailed evaluation outputs are stored as an output artifact in your object store. To download all output files programmatically: + +```python +# download the result artifacts from Object store. +import boto3 + +def download_all_objects(prefix, destination_folder): + """ + Recursively download all objects from an S3 bucket starting with a specific prefix. + + :param bucket_name: Name of the S3 bucket. + :param prefix: Prefix to filter objects in the bucket. + :param destination_folder: Local folder to save the downloaded files. 
+ """ + s3_client = boto3.client( + 's3', + aws_access_key_id=AWS_ACCESS_KEY, + aws_secret_access_key=AWS_SECRET_ACCESS_KEY, + region_name=AWS_REGION + ) + + # Ensure the destination folder exists + if not os.path.exists(destination_folder): + os.makedirs(destination_folder) + + # Paginate through objects + paginator = s3_client.get_paginator('list_objects_v2') + pages = paginator.paginate(Bucket=AWS_BUCKET_ID, Prefix=prefix) + + for page in pages: + if 'Contents' in page: + for obj in page['Contents']: + key = obj['Key'] + local_file_path = os.path.join(destination_folder, os.path.relpath(key, prefix)) + + # Ensure the local directory structure exists + local_directory = os.path.dirname(local_file_path) + if not os.path.exists(local_directory): + os.makedirs(local_directory) + + # Download the object + print(f"Downloading {key} to {local_file_path}") + s3_client.download_file(AWS_BUCKET_ID, key, local_file_path) + + +# Download the evaluation results from the object store. Look at execution status under "outputArtifacts" key to see the 'url' +# which shows the data path of where your output results are stored +EXECUTION_ID = execution_id +sqlite_db_prefix = f'{EXECUTION_ID}/evaluation_result/' # change the prefix based on where your output artifact is stored in the bucket. +destination_folder = 'results-new' + +download_all_objects(sqlite_db_prefix, destination_folder) +``` + +![img](img/image_48.png) + +**View Detailed Results (SQLite DB)** + +The evaluation stores detailed instance-level results in results.db. + +Example: Reading SQLite tables: + +```python +# viewing the results from sqlite db in tabular format.. 
+import sqlite3 +import pandas as pd +from IPython.display import display, HTML + +# Path to your SQLite database file +db_file = 'results-new/results.db' + +connection = sqlite3.connect(db_file) + +# Specify the table names you want to display +table_names = ['run','configuration', 'submission', 'submission_result', 'evaluation_result'] + +# Create the CSS and HTML container +html_content = """ + +
+""" + +for table_name in table_names: + query = f"SELECT * FROM {table_name};" + df = pd.read_sql_query(query, connection) + # If you want to see all the rows across all tables, remove/comment the next line + df = df.head(10) # Limiting the number of rows displayed + table_html = df.to_html(classes='table-container', index=False) + html_content += f""" +
+

Table: {table_name}

+ {table_html} +
+ """ + +html_content += "
" + +display(HTML(html_content)) + +# Close the connection +connection.close() +``` + +**Process and Rank Models (Optional Python Helpers)** + +The notebook includes post-processing utilities that: + + - normalize numeric metrics + + - process boolean and categorical metrics + + - compute weighted scores + + - generate a final ranking to identify the best model + +```python +import pandas as pd +from IPython.display import HTML + +# Scoring logic depends on "scoring_type" +# "weight" represents the relative weight of this metric to all SELECTED metrics +METRICS_SCORING_TYPE_MAPPING = { + "Content Filter on Input": { + "scoring_type": "bool-false", # False is good + "weight": 1 + }, + "Content Filter on Output": { + "scoring_type": "bool-false", # False is good + "weight": 1 + }, + "Pointwise Instruction Following": { + "scoring_type": "num_1_to_5", + "weight": 1 + }, + "Pointwise Answer Relevance": { + "scoring_type": "num_1_to_5", + "weight": 1 + }, + "Pointwise Conciseness": { + "scoring_type": "num_1_to_5", + "weight": 1 + }, + "Pointwise Correctness": { + "scoring_type": "num_1_to_5", + "weight": 1 + }, + "BLEU": { + "scoring_type": "num_0_to_1", + "weight": 1 + }, + "ROUGE": { + "scoring_type": "num_0_to_1", + "weight": 1 + }, + "BERT Score": { + "scoring_type": "F1/Precision/Recall num_0_to_1", + "weight": 1 + } +} + +def calculate_bool_metric_score(pivot_df, metric_base_name, true_is_good): + """ + Calculate scores for boolean metrics based on False/True counts. 
+ + Args: + pivot_df: DataFrame with models as rows and metrics as columns + metric_base_name: Base name of the metric (without /False/count or /True/count) + true_is_good: Boolean indicating if True is considered a good outcome + + Returns: + Series with boolean metric scores per model (scaled to -1 to 1) + """ + false_col = f"{metric_base_name}/False/count" + true_col = f"{metric_base_name}/True/count" + + false_values = pivot_df[false_col] if false_col in pivot_df.columns else 0 + true_values = pivot_df[true_col] if true_col in pivot_df.columns else 0 + total_values = true_values + false_values + + score = ((false_values * 1) + (true_values * -1)) / total_values + + if true_is_good: + score = 0 - score + + return score + +def calculate_numeric_metric_score(pivot_df, metric_base_name, range_min=0, range_max=1): + """ + Calculate scores for numeric metrics with /mean + The mean is normalized to a score between -1 and 1 using the provided range. + + Args: + pivot_df: DataFrame with models as rows and metrics as columns + metric_base_name: Base name of the metric (without suffixes) + range_min: Minimum possible value of the metric + range_max: Maximum possible value of the metric + + Returns: + Series with numeric metric scores per model (scaled to -1 to 1) + """ + mean_col = f"{metric_base_name}/mean" + + if mean_col not in pivot_df.columns: + return pd.Series(0.0, index=pivot_df.index) + + mean_values = pivot_df[mean_col] + + # Linear normalization from [range_min, range_max] to [0, 1] + normalized = (mean_values - range_min) / (range_max - range_min) + + # Scale to [-1, 1] + score = (normalized * 2) - 1 + + return score + +def calculate_bert_score(pivot_df, metric_base_name): + """ + Calculate BERT Score by averaging F1, Precision, and Recall scores. 
+ + Args: + pivot_df: DataFrame with models as rows and metrics as columns + metric_base_name: Base name "BERT Score" + + Returns: + Series with BERT scores per model (scaled to -1 to 1) + """ + f1_col = f"{metric_base_name}/F1/mean" + precision_col = f"{metric_base_name}/Precision/mean" + recall_col = f"{metric_base_name}/Recall/mean" + + scores = [] + for col in [f1_col, precision_col, recall_col]: + if col in pivot_df.columns: + scores.append(pivot_df[col]) + + if not scores: + return pd.Series(0.0, index=pivot_df.index) + + # Average the three metrics (already in 0 to 1 range) + avg_score = sum(scores) / len(scores) + + # Scale to [-1, 1] + score = (avg_score * 2) - 1 + + return score + +def find_unique_metrics_in_pivot(pivot_df): + """ + Identify unique metric base names present in the pivot table. + + Args: + pivot_df: DataFrame with models as rows and metrics as columns + """ + # Extract unique metric names from pivot table columns + unique_metrics = set() + for col in pivot_df.columns: + # Extract base metric name by removing suffixes + base_name = col + for suffix in ['/False/count', '/True/count', '/F1_score/mean','/Precision_score/mean', + '/Recall_score/mean','/mean','/median', '/p90', '/p95', '/stddev']: + if suffix in base_name and "BERT Score" not in base_name: + base_name = base_name.replace(suffix, '') + unique_metrics.add(base_name) + break + if base_name.startswith("BERT Score/"): + base_name = "BERT Score" + unique_metrics.add(base_name) + if not unique_metrics: + raise ValueError("No valid metrics found in pivot table") + return unique_metrics + + +def rank_models(pivot_df, unique_metrics=None): + """ + Rank models based on metrics present in the pivot table. 
+ + Args: + pivot_df: DataFrame with models as rows (index) and metrics as columns + + Returns: + DataFrame with model rankings and scores + """ + # Calculate total weight for metrics present in pivot table + total_weight = sum(METRICS_SCORING_TYPE_MAPPING[m]["weight"] for m in unique_metrics) + + # Initialize total score + total_scores = pd.Series(0.0, index=pivot_df.index) + + # Process each metric found in the pivot table + for metric_name in unique_metrics: + config = METRICS_SCORING_TYPE_MAPPING[metric_name] + scoring_type = config["scoring_type"] + weight = config["weight"] / total_weight + + if scoring_type == "bool-false": + # False is good (True is bad) + metric_score = calculate_bool_metric_score(pivot_df, metric_name, true_is_good=False) + total_scores += metric_score * weight + + elif scoring_type == "bool-true": + # True is good (False is bad) + metric_score = calculate_bool_metric_score(pivot_df, metric_name, true_is_good=True) + total_scores += metric_score * weight + + elif scoring_type == "num_1_to_5": + metric_score = calculate_numeric_metric_score(pivot_df, metric_name, range_min=1, range_max=5) + total_scores += metric_score * weight + + elif scoring_type == "num_0_to_1": + metric_score = calculate_numeric_metric_score(pivot_df, metric_name, range_min=0, range_max=1) + total_scores += metric_score * weight + + elif scoring_type == "F1/Precision/Recall num_0_to_1": + # BERT Score + metric_score = calculate_bert_score(pivot_df, metric_name) + total_scores += metric_score * weight + + # Create results DataFrame + results_df = pd.DataFrame({ + 'model': pivot_df.index, + 'total_score': total_scores.values + }) + + # Rank models (higher score = better rank) + results_df['rank'] = results_df['total_score'].rank(ascending=False, method='min').astype(int) + results_df = results_df.sort_values('rank') + + return results_df + +def get_detailed_scores(pivot_df, unique_metrics): + """ + Get detailed breakdown of scores per metric for each model. 
+ + Args: + pivot_df: DataFrame with models as rows and metrics as columns + + Returns: + DataFrame with detailed scores per metric + """ + detailed_scores = pd.DataFrame(index=pivot_df.index) + + # Process each metric in the mapping + for metric_name in unique_metrics: + scoring_type = METRICS_SCORING_TYPE_MAPPING[metric_name]["scoring_type"] + + if scoring_type == "bool-false": + detailed_scores[f"{metric_name}_score"] = calculate_bool_metric_score(pivot_df, metric_name, true_is_good=False) + + elif scoring_type == "bool-true": + detailed_scores[f"{metric_name}_score"] = calculate_bool_metric_score(pivot_df, metric_name, true_is_good=True) + + elif scoring_type == "num_1_to_5": + detailed_scores[f"{metric_name}_score"] = calculate_numeric_metric_score(pivot_df, metric_name, range_min=1, range_max=5) + + elif scoring_type == "num_0_to_1": + detailed_scores[f"{metric_name}_score"] = calculate_numeric_metric_score(pivot_df, metric_name, range_min=0, range_max=1) + + elif scoring_type == "F1/Precision/Recall num_0_to_1": + detailed_scores[f"{metric_name}_score"] = calculate_bert_score(pivot_df, metric_name) + + return detailed_scores + +unique_metrics = find_unique_metrics_in_pivot(metrics_pivot) + +# Get detailed scores breakdown +detailed = get_detailed_scores(metrics_pivot, unique_metrics) +display(HTML(detailed.to_html())) + +# Rank models +ranking = rank_models(metrics_pivot, unique_metrics) +display(HTML(ranking.to_html())) +``` +This provides a clear ranking of models based on the metrics you selected during evaluation. 
+ +![img](img/image_py_rk.png) + +[OPTION END] + +[OPTION BEGIN [Bruno]] + +Retrieve Aggregate Metrics + +Send a GET request: + +**GET** +```bash +{{apiurl}}/v2/lm/metrics?tagFilters=evaluation.ai.sap.com/child-of={{execution_id}} +``` +or using dataset run name: + +**GET** +```bash +{{apiurl}}/v2/lm/metrics?tagFilters=evaluation.ai.sap.com/run-name={{run_name}} +``` + +This returns aggregated values for: + + - latency + + - token usage + + - metric scores + + - completion count + +**Download Raw Results** + +1. Open the execution details + +2. Copy the output artifact URL + +3. Download the folder to obtain + + - step-wise results + + - sqlite_combined/results.db + +**Inspect Detailed Results** + +Open the SQLite DB in any client to inspect: + + - submissions + + - completion responses + + - evaluation_results (raw metric scores) + + - aggregation_results + + - custom_logs + +![img](img/image_49.png) + +[OPTION END] + +### Delete Evaluation Artifacts, Configurations & Metrics + +Over time, your workspace may accumulate old configurations, executions, and metrics. +SAP AI Core allows you to safely delete these resources once they are no longer needed. + +This section explains how to delete: + + - Evaluation Executions + + - Evaluation Configurations + + - Custom Metrics (if created) + +⚠️ Important: + +Deletions are permanent and cannot be undone. +System-defined metrics cannot be deleted — only your custom metrics. + +[OPTION BEGIN [SAP AI Launchpad]] + +**Delete Executions** + +1. Go to Evaluations → Executions + +2. Select the execution + +3. Click Delete + +4. Confirm the deletion + +**Delete Evaluation Configurations** + +1. Go to Evaluations → Configurations + +2. Select the configuration you created + +3. Click Delete + +[OPTION END] + +[OPTION BEGIN [Python]] + +**1. 
Delete an Evaluation Execution** + +```python +#Delete Execution Id +def delete_execution(): + headers = _get_headers() + EXEC_ID = execution_id + GET_EXECUTIONS_ENDPOINT = '/v2/lm/executions/' + request_url = f"{AICORE_BASE_URL}{GET_EXECUTIONS_ENDPOINT}{EXEC_ID}" + try: + response = requests.delete( + request_url, headers=headers, params={"AI-Resource-Group":AICORE_RESOURCE_GROUP}, timeout=120 + ) + print(response) + if(response.status_code != 202): + raise + result = response.json() + print(result) + except: + logging.error("Error occurred while attempting to delete a Configuration") + raise + +delete_execution() +``` +**2. Delete an Evaluation Configuration** + +```python +def delete_configuration(configuration_id): + headers = _get_headers() + endpoint = f"/v2/lm/configurations/{configuration_id}" + url = f"{AICORE_BASE_URL}{endpoint}" + + response = requests.delete(url, headers=headers) + print("Status:", response.status_code) + print(response.text) + +# Example: +delete_configuration(configuration_id) +``` + +**3. Delete a Custom Metric** + +```python +def delete_metric(metric_id): + headers = _get_headers() + endpoint = f"/v2/lm/evaluationMetrics/{metric_id}" + url = f"{AICORE_BASE_URL}{endpoint}" + + response = requests.delete(url, headers=headers) + print("Status:", response.status_code) + print(response.text) + +# Example: +delete_metric(metric_id) +``` + +[OPTION END] + +[OPTION BEGIN [Bruno]] + +**1. Delete Execution** + +**DELETE Request** +```bash +{{apiurl}}/v2/lm/executions/{{execution_id}} +``` +**Headers:** +``` +Authorization: Bearer {{access_token}} +AI-Resource-Group: {{resource_group}} +``` +**2. Delete Configuration** + +```bash +DELETE {{apiurl}}/v2/lm/configurations/{{configuration_id}} +``` + +**3. 
Delete Custom Metric** + +```bash +DELETE {{apiurl}}/v2/lm/evaluationMetrics/{{metric_id}} +``` + +[OPTION END] diff --git a/tutorials/ai-core-genaihub-evaluation/evaluation_public_simplified_workflow.ipynb b/tutorials/ai-core-genaihub-evaluation/evaluation_public_simplified_workflow.ipynb new file mode 100644 index 000000000..a03d5dbfc --- /dev/null +++ b/tutorials/ai-core-genaihub-evaluation/evaluation_public_simplified_workflow.ipynb @@ -0,0 +1,2745 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generative AI Custom Evaluation\n", + "This is an example notebook which showcases how a user can use AI Core custom evaluation to benchmark their large language models, evaluate orchestration configuration or prompts for their use case.\n", + "It uses publicly available [MedicationQA dataset](https://langtest.org/docs/pages/benchmarks/medical/medicationqa/) which consists of commonly asked consumer questions about medications. The workload computes industry standard metrics to check the reliability of the response generate by llm.\n", + "
**Note: For detailed instructions please refer to [Readme](./Readme.md)**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Place your Dataset and Run configuration\n", + "Place your files as such:\n", + "1. Place your dataset in the folder `PUT_YOUR_DATASET_HERE`\n", + "2. Place your prompt template configurations in the folder `PUT_YOUR_PROMPT_TEMPLATE_HERE`\n", + "3. Place your custom metric files in folder `PUT_YOUR_CUSTOM_METRIC_HERE`\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SetUp (Step 1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: generative-ai-hub-sdk==4.4.3 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from -r requirements.txt (line 1)) (4.4.3)\n", + "Requirement already satisfied: python-dotenv==1.0.1 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from -r requirements.txt (line 2)) (1.0.1)\n", + "Requirement already satisfied: boto3==1.37.4 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from -r requirements.txt (line 3)) (1.37.4)\n", + "Requirement already satisfied: pandas==2.2.3 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from -r requirements.txt (line 4)) (2.2.3)\n", + "Requirement already satisfied: json2html==1.3.0 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from -r requirements.txt (line 5)) (1.3.0)\n", + "Requirement already satisfied: numpy==1.26.4 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from -r requirements.txt (line 6)) (1.26.4)\n", + "Requirement already satisfied: ipywidgets==8.1.0 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from -r requirements.txt (line 7)) (8.1.0)\n", + 
"Requirement already satisfied: pydantic==2.9.2 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (2.9.2)\n", + "Requirement already satisfied: dacite>=1.8.1 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (1.9.2)\n", + "Requirement already satisfied: ai-core-sdk>=2.5.7 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (2.6.2)\n", + "Requirement already satisfied: httpx>=0.27.0 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (0.28.1)\n", + "Requirement already satisfied: click>=8.1.7 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (8.1.8)\n", + "Requirement already satisfied: openai>=1.56.0 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (2.6.1)\n", + "Requirement already satisfied: overloading==0.5.0 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (0.5.0)\n", + "Requirement already satisfied: packaging>=23.2 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (25.0)\n", + "Requirement already satisfied: s3transfer<0.12.0,>=0.11.0 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from boto3==1.37.4->-r requirements.txt (line 3)) (0.11.5)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in 
/Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from boto3==1.37.4->-r requirements.txt (line 3)) (1.0.1)\n", + "Requirement already satisfied: botocore<1.38.0,>=1.37.4 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from boto3==1.37.4->-r requirements.txt (line 3)) (1.37.38)\n", + "Requirement already satisfied: pytz>=2020.1 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from pandas==2.2.3->-r requirements.txt (line 4)) (2025.2)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from pandas==2.2.3->-r requirements.txt (line 4)) (2.9.0.post0)\n", + "Requirement already satisfied: tzdata>=2022.7 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from pandas==2.2.3->-r requirements.txt (line 4)) (2025.2)\n", + "Requirement already satisfied: widgetsnbextension~=4.0.7 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ipywidgets==8.1.0->-r requirements.txt (line 7)) (4.0.15)\n", + "Requirement already satisfied: jupyterlab-widgets~=3.0.7 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ipywidgets==8.1.0->-r requirements.txt (line 7)) (3.0.16)\n", + "Requirement already satisfied: comm>=0.1.3 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ipywidgets==8.1.0->-r requirements.txt (line 7)) (0.2.3)\n", + "Requirement already satisfied: traitlets>=4.3.1 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ipywidgets==8.1.0->-r requirements.txt (line 7)) (5.14.3)\n", + "Requirement already satisfied: ipython>=6.1.0 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ipywidgets==8.1.0->-r requirements.txt (line 7)) (8.18.1)\n", + "Requirement already satisfied: 
annotated-types>=0.6.0 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from pydantic==2.9.2->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (0.7.0)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from pydantic==2.9.2->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (4.15.0)\n", + "Requirement already satisfied: pydantic-core==2.23.4 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from pydantic==2.9.2->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (2.23.4)\n", + "Requirement already satisfied: ai-api-client-sdk==2.6.1 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ai-core-sdk>=2.5.7->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (2.6.1)\n", + "Requirement already satisfied: aenum~=3.1 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ai-api-client-sdk==2.6.1->ai-core-sdk>=2.5.7->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (3.1.16)\n", + "Requirement already satisfied: pyhumps~=3.0 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ai-api-client-sdk==2.6.1->ai-core-sdk>=2.5.7->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (3.8.0)\n", + "Requirement already satisfied: requests<3.0 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ai-api-client-sdk==2.6.1->ai-core-sdk>=2.5.7->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (2.32.5)\n", + "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from botocore<1.38.0,>=1.37.4->boto3==1.37.4->-r requirements.txt (line 3)) (1.26.20)\n", + "Requirement already satisfied: httpcore==1.* in 
/Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from httpx>=0.27.0->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (1.0.9)\n", + "Requirement already satisfied: anyio in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from httpx>=0.27.0->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (4.11.0)\n", + "Requirement already satisfied: idna in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from httpx>=0.27.0->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (3.11)\n", + "Requirement already satisfied: certifi in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from httpx>=0.27.0->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (2025.10.5)\n", + "Requirement already satisfied: h11>=0.16 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from httpcore==1.*->httpx>=0.27.0->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (0.16.0)\n", + "Requirement already satisfied: matplotlib-inline in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.1.0->-r requirements.txt (line 7)) (0.2.1)\n", + "Requirement already satisfied: decorator in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.1.0->-r requirements.txt (line 7)) (5.2.1)\n", + "Requirement already satisfied: jedi>=0.16 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.1.0->-r requirements.txt (line 7)) (0.19.2)\n", + "Requirement already satisfied: pygments>=2.4.0 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.1.0->-r requirements.txt (line 7)) (2.19.2)\n", + "Requirement already satisfied: exceptiongroup in 
/Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.1.0->-r requirements.txt (line 7)) (1.3.0)\n", + "Requirement already satisfied: pexpect>4.3 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.1.0->-r requirements.txt (line 7)) (4.9.0)\n", + "Requirement already satisfied: stack-data in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.1.0->-r requirements.txt (line 7)) (0.6.3)\n", + "Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from ipython>=6.1.0->ipywidgets==8.1.0->-r requirements.txt (line 7)) (3.0.52)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from openai>=1.56.0->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (1.9.0)\n", + "Requirement already satisfied: tqdm>4 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from openai>=1.56.0->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (4.67.1)\n", + "Requirement already satisfied: sniffio in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from openai>=1.56.0->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (1.3.1)\n", + "Requirement already satisfied: jiter<1,>=0.10.0 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from openai>=1.56.0->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (0.11.1)\n", + "Requirement already satisfied: six>=1.5 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas==2.2.3->-r requirements.txt (line 4)) (1.17.0)\n", + "Requirement already satisfied: parso<0.9.0,>=0.8.4 in 
/Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets==8.1.0->-r requirements.txt (line 7)) (0.8.5)\n", + "Requirement already satisfied: ptyprocess>=0.5 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets==8.1.0->-r requirements.txt (line 7)) (0.7.0)\n", + "Requirement already satisfied: wcwidth in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets==8.1.0->-r requirements.txt (line 7)) (0.2.14)\n", + "Requirement already satisfied: pure-eval in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from stack-data->ipython>=6.1.0->ipywidgets==8.1.0->-r requirements.txt (line 7)) (0.2.3)\n", + "Requirement already satisfied: asttokens>=2.1.0 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from stack-data->ipython>=6.1.0->ipywidgets==8.1.0->-r requirements.txt (line 7)) (3.0.0)\n", + "Requirement already satisfied: executing>=1.2.0 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from stack-data->ipython>=6.1.0->ipywidgets==8.1.0->-r requirements.txt (line 7)) (2.2.1)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /Users/c5408555/Downloads/notebook-update-v2/venv/lib/python3.9/site-packages (from requests<3.0->ai-api-client-sdk==2.6.1->ai-core-sdk>=2.5.7->generative-ai-hub-sdk==4.4.3->-r requirements.txt (line 1)) (3.4.4)\n", + "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 25.3 is available.\n", + "You should consider upgrading via the '/Users/c5408555/Downloads/notebook-update-v2/venv/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m(Deprecated) Installing extensions with the jupyter labextension install command is now deprecated and will be removed in a 
future major version of JupyterLab.\n", + "\n", + "Users should manage prebuilt extensions with package managers like pip and conda, and extension authors are encouraged to distribute their extensions as prebuilt packages \u001b[0m\n", + "Building jupyterlab assets (production, minimized)\n" + ] + } + ], + "source": [ + "! pip install -r requirements.txt\n", + "! jupyter labextension install @jupyter-widgets/jupyterlab-manager" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load your environment variables\n", + "\n", + "Ensure that your environment variables are set in a `.env` file (see sample.env for an example). If there is a missing field the notebook will prompt you for a value." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading the credentials from the env file\n", + "from gen_ai_hub.proxy.gen_ai_hub_proxy import GenAIHubProxyClient\n", + "from dotenv import load_dotenv\n", + "import os\n", + "\n", + "load_dotenv(override=True)\n", + "\n", + "\n", + "# Fetching environment variables or prompting the user if missing\n", + "AICORE_BASE_URL = os.getenv(\"AICORE_BASE_URL\") or input(\"AICORE_BASE_URL is missing. Please enter it: \")\n", + "AICORE_RESOURCE_GROUP = os.getenv(\"AICORE_RESOURCE_GROUP\") or input(\"AICORE_RESOURCE_GROUP is missing. Please enter it (default: 'default'): \") or \"default\"\n", + "AICORE_AUTH_URL = os.getenv(\"AICORE_AUTH_URL\") or input(\"AICORE_AUTH_URL is missing. Please enter it: \")\n", + "AICORE_CLIENT_ID = os.getenv(\"AICORE_CLIENT_ID\") or input(\"AICORE_CLIENT_ID is missing. Please enter it: \")\n", + "AICORE_CLIENT_SECRET = os.getenv(\"AICORE_CLIENT_SECRET\") or input(\"AICORE_CLIENT_SECRET is missing. Please enter it: \")\n", + "\n", + "AWS_ACCESS_KEY = os.getenv(\"AWS_ACCESS_KEY\") or input(\"AWS_ACCESS_KEY is missing. 
Please enter it: \")\n", + "AWS_BUCKET_ID = os.getenv(\"AWS_BUCKET_ID\") or input(\"AWS_BUCKET_ID is missing. Please enter it: \")\n", + "AWS_REGION = os.getenv(\"AWS_REGION\") or input(\"AWS_REGION is missing. Please enter it: \")\n", + "AWS_SECRET_ACCESS_KEY = os.getenv(\"AWS_SECRET_ACCESS_KEY\") or input(\"AWS_SECRET_ACCESS_KEY is missing. Please enter it: \")\n", + "DEPLOYMENT_URL = os.getenv(\"DEPLOYMENT_URL\", None)\n", + "\n", + "# Initializing the GenAIHubProxyClient\n", + "client = GenAIHubProxyClient(\n", + " base_url=AICORE_BASE_URL,\n", + " auth_url=AICORE_AUTH_URL,\n", + " client_id=AICORE_CLIENT_ID,\n", + " client_secret=AICORE_CLIENT_SECRET,\n", + " resource_group=AICORE_RESOURCE_GROUP\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dependencies and Helper Functions (Step 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prompt configs: ['prompt_template.json']\n", + "Dataset name: medicalqna_dataset.csv\n", + "Prompt template contents: {'template': [{'role': 'user', 'content': 'List the benefits and side effects of the drug in the following consumer health question: {{?question}}.'}]}\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "\n", + "def get_prompt_config_file(folder_path):\n", + " \"\"\"\n", + " Retrieves a list of all JSON file names in the specified folder.\n", + " \"\"\"\n", + " if not os.path.isdir(folder_path):\n", + " print(f\"The folder path '{folder_path}' does not exist.\")\n", + " return []\n", + "\n", + " json_files = [file for file in os.listdir(folder_path) if file.endswith(\".json\")]\n", + "\n", + " if not json_files:\n", + " print(f\"No JSON files were found in the folder '{folder_path}'.\")\n", + " return json_files\n", + "\n", + "\n", + "def get_dataset_file_name(folder_path):\n", + " \"\"\"\n", + " Retrieves the name of the first file in the specified 
folder.\n", + " \"\"\"\n", + " if not os.path.isdir(folder_path):\n", + " print(f\"The folder path '{folder_path}' does not exist.\")\n", + " return None\n", + "\n", + " items_in_folder = os.listdir(folder_path)\n", + "\n", + " for item in items_in_folder:\n", + " item_path = os.path.join(folder_path, item)\n", + " if os.path.isfile(item_path):\n", + " return item\n", + "\n", + " print(f\"No files were found in the folder '{folder_path}'.\")\n", + " return None\n", + "\n", + "\n", + "def load_prompt_template(folder_path, file_name):\n", + " \"\"\"\n", + " Loads the contents of a JSON prompt template into a variable.\n", + " \"\"\"\n", + " file_path = os.path.join(folder_path, file_name)\n", + " try:\n", + " with open(file_path, \"r\", encoding=\"utf-8\") as f:\n", + " return json.load(f) # returns Python dict\n", + " except Exception as e:\n", + " print(f\"Error loading prompt template: {e}\")\n", + " return None\n", + "\n", + "# --- MAIN EXECUTION ---\n", + "PROMPT_FOLDER = \"./PUT_YOUR_PROMPT_TEMPLATE_HERE\"\n", + "DATASET_FOLDER = \"./PUT_YOUR_DATASET_HERE\"\n", + "\n", + "PROMPT_CONFIG_FILES = get_prompt_config_file(PROMPT_FOLDER)\n", + "DATASET_NAME = get_dataset_file_name(DATASET_FOLDER)\n", + "\n", + "if PROMPT_CONFIG_FILES and DATASET_NAME:\n", + " # Load the first JSON prompt template\n", + " PROMPT_TEMPLATE = load_prompt_template(PROMPT_FOLDER, PROMPT_CONFIG_FILES[0])\n", + " print(f\"Prompt configs: {PROMPT_CONFIG_FILES}\")\n", + " print(f\"Dataset name: {DATASET_NAME}\")\n", + " print(\"Prompt template contents:\", PROMPT_TEMPLATE)\n", + "else:\n", + " print(\"Missing run or dataset file.\")\n", + " raise SystemExit(\"Exiting due to missing run/dataset file.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Register an Object Store Secret\n", + "To use the evaluations service, you must register an object store with the name default. Optionally, you can register an additional object store with a name of your choice." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# setup authentication and headers needed for AI Core requests\n", + "def _get_headers():\n", + " headers = {\n", + " \"Authorization\": client.get_ai_core_token(),\n", + " \"AI-Resource-Group\": AICORE_RESOURCE_GROUP,\n", + " \"Content-Type\": \"application/json\",\n", + " }\n", + " return headers" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Successfully deleted object store secret: default\n", + "Successfully deleted object store secret: genai-quick-data-notebook\n" + ] + }, + { + "data": { + "text/plain": [ + "{'message': 'secret has been created'}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Register S3 secret with AI Core which will be used an input source \n", + "import requests\n", + "import json\n", + "import logging\n", + "\n", + "def delete_oss_secret(oss_name=\"\"):\n", + " headers = _get_headers()\n", + " \n", + " DELETE_SECRETS_ENDPOINT = f'/v2/admin/objectStoreSecrets/{oss_name}'\n", + " request_url = f\"{AICORE_BASE_URL}{DELETE_SECRETS_ENDPOINT}\"\n", + " \n", + " try:\n", + " response = requests.delete(request_url, headers=headers, timeout=120)\n", + " if response.status_code == 202:\n", + " print(f\"Successfully deleted object store secret: {oss_name}\")\n", + " elif response.status_code == 404:\n", + " print(f\"Object store secret not found: {oss_name}. 
It may not exist.\")\n", + " else:\n", + " logging.error(f\"Failed to delete object store secret: {oss_name}, Status Code: {response.status_code}\")\n", + " except Exception as e:\n", + " logging.error(f\"Error occurred while attempting to delete object store secret: {e}\")\n", + " raise\n", + "\n", + "def register_oss_secret(oss_name=\"\", path_prefix=\"\"):\n", + " headers = _get_headers()\n", + " \n", + " POST_SECRETS_ENDPOINT = '/v2/admin/objectStoreSecrets'\n", + " request_url = f\"{AICORE_BASE_URL}{POST_SECRETS_ENDPOINT}\"\n", + " \n", + " request_body = {\n", + " \"name\": oss_name,\n", + " \"data\": {\n", + " \"AWS_ACCESS_KEY_ID\": AWS_ACCESS_KEY,\n", + " \"AWS_SECRET_ACCESS_KEY\": AWS_SECRET_ACCESS_KEY\n", + " },\n", + " \"type\": \"S3\",\n", + " \"bucket\": AWS_BUCKET_ID,\n", + " \"endpoint\": \"s3-eu-central-1.amazonaws.com\",\n", + " \"region\": AWS_REGION,\n", + " \"pathPrefix\": path_prefix,\n", + " \"verifyssl\": \"0\",\n", + " \"usehttps\": \"1\",\n", + " }\n", + " try:\n", + " response = requests.post(\n", + " request_url, headers=headers, data=json.dumps(request_body), timeout=120\n", + " )\n", + " result = response.json()\n", + " return result\n", + " except:\n", + " logging.error(\"Error occurred while attempting to create object store secret\")\n", + " raise\n", + " \n", + "delete_oss_secret(oss_name=\"default\")\n", + "delete_oss_secret(oss_name=\"genai-quick-data-notebook\")\n", + " \n", + "register_oss_secret(oss_name=\"default\", path_prefix=\"\")\n", + "register_oss_secret(oss_name=\"genai-quick-data-notebook\", path_prefix=\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val of root is medicalqna_dataset.csv\n", + "val of s3 key is genaiEvaluation/I321506/testdata/medicalqna_dataset.csv\n", + "Uploading ./PUT_YOUR_DATASET_HERE\\medicalqna_dataset.csv to 
s3://hcp-b60330de-a879-4848-9a3d-0ac828f4517c/genaiEvaluation/I321506/testdata/medicalqna_dataset.csv\n" + ] + } + ], + "source": [ + "# uploading these files to Object store to register as an artifact inside ai core\n", + "\n", + "import boto3\n", + "import os\n", + "import uuid\n", + "\n", + "def upload_folder_to_s3(folder_path, bucket_name, s3_prefix=\"\"):\n", + " \"\"\"\n", + " Upload a folder to an S3 bucket recursively.\n", + "\n", + " :param folder_path: The local folder path to upload.\n", + " :param bucket_name: The name of the S3 bucket.\n", + " :param s3_prefix: Optional prefix to use for the S3 keys (e.g., subfolder in the bucket).\n", + " \"\"\"\n", + " s3_client = boto3.client(\n", + " 's3',\n", + " aws_access_key_id=AWS_ACCESS_KEY,\n", + " aws_secret_access_key=AWS_SECRET_ACCESS_KEY,\n", + " region_name=AWS_REGION\n", + " )\n", + "\n", + " for root, dirs, files in os.walk(folder_path):\n", + " for file_name in files:\n", + " print(\"val of root is \", file_name)\n", + " local_path = os.path.join(root, file_name)\n", + " # Compute the relative path for the S3 key\n", + " relative_path = os.path.relpath(local_path, folder_path)\n", + " s3_key = os.path.join(s3_prefix, relative_path).replace(\"\\\\\", \"/\") # Ensure S3-compatible paths\n", + " print(\"val of s3 key is \", s3_key)\n", + " print(f\"Uploading {local_path} to s3://{bucket_name}/{s3_key}\")\n", + " \n", + " # Upload the file\n", + " s3_client.upload_file(local_path, bucket_name, s3_key)\n", + "\n", + "# Example usage\n", + "folder_to_upload_testdata = \"./PUT_YOUR_DATASET_HERE\"\n", + "folder_to_upload_custommetric = \"./PUT_YOUR_CUSTOM_METRIC_HERE\"\n", + "user_directory_prefix = \"I321506\" # replace with your i-number as string here\n", + "prefix_guid = user_directory_prefix if user_directory_prefix is not None else str(uuid.uuid4().hex)\n", + "s3_testdata_prefix = f\"genaiEvaluation/{prefix_guid}/testdata\" # Leave empty for root of the bucket\n", + "\n", + "\n", + 
"upload_folder_to_s3(folder_to_upload_testdata, AWS_BUCKET_ID, s3_testdata_prefix)\n", + "input_artifact_path = f\"ai://genai-quick-data-notebook/genaiEvaluation/{prefix_guid}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The user stores the input files in the object store and registers the root folder as artifact with AI Core. The File Upload and Artifact endpoints of AI Core API may be used for this purpose. In this example `genaiEvaluation\\{prefix_guid}` is the root folder containing the orchestration configurations and test data which is registered as AI Core artifact." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'id': '75dd115f-1707-4de6-9031-ce22d8c12015', 'message': 'Artifact acknowledged', 'url': 'ai://genai-quick-data-notebook/genaiEvaluation/I321506'}\n" + ] + } + ], + "source": [ + "import requests\n", + "import logging\n", + "# Registering the uploaded files from AWS as artifacts to use inside configuration.\n", + "\n", + "def register_artifact():\n", + " headers = _get_headers()\n", + " \n", + " GET_ARTIFACTS_ENDPOINT = '/v2/lm/artifacts'\n", + " request_url = f\"{AICORE_BASE_URL}{GET_ARTIFACTS_ENDPOINT}\"\n", + " \n", + " request_body = {\n", + " \"labels\": [\n", + " {\n", + " \"key\": \"ext.ai.sap.com/prompt-evaluation\",\n", + " \"value\": \"true\"\n", + " }\n", + " ],\n", + " \"name\": \"genai-eval-simplified-test-data\",\n", + " \"kind\": \"other\",\n", + " \"url\": input_artifact_path, # input artifact path\n", + " \"description\": \"demo artifacts for evaluation flow.\",\n", + " \"scenarioId\": \"genai-evaluations\"\n", + " }\n", + " try:\n", + " response = requests.post(\n", + " request_url, headers=headers, data=json.dumps(request_body), timeout=120\n", + " )\n", + " result = response.json()\n", + " print(result)\n", + " return result['id']\n", + " except:\n", + " print(\"Error occurred while 
attempting to create an execution\")\n", + " raise\n", + " \n", + "\n", + "artifact_id = register_artifact()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Orchestration Deployment\n", + "An orchestration Deployment URL is required for us to run our evaluation. Once created we need to wait until the deployment is running and provides us a deployment url which will be add to our configuration file in the next step. You can skip this step if you already have a orchestration deployment running." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import json\n", + "import time\n", + "\n", + "\n", + "\n", + "def create_orchestration_configuration():\n", + " headers = _get_headers()\n", + " request_body = {\n", + " \"name\": \"orchestrationDeployment\",\n", + " \"executableId\": \"orchestration\",\n", + " \"scenarioId\": \"orchestration\",\n", + " \"parameterBindings\": [\n", + " {\n", + " \"key\": \"modelFilterList\",\n", + " \"value\": \"null\"\n", + " },\n", + " {\n", + " \"key\": \"modelFilterListType\",\n", + " \"value\": \"allow\"\n", + " }\n", + " ],\n", + " \"inputArtifactBindings\": []\n", + " }\n", + " \n", + " GET_CONFIGURATIONS_ENDPOINT = '/v2/lm/configurations'\n", + " request_url = f\"{AICORE_BASE_URL}{GET_CONFIGURATIONS_ENDPOINT}\"\n", + " try:\n", + " response = requests.post(\n", + " request_url, headers=headers, data=json.dumps(request_body), timeout=120\n", + " )\n", + " print(response)\n", + " if(response.status_code != 201):\n", + " raise\n", + " result = response.json()\n", + " print(result)\n", + " return result['id']\n", + " except:\n", + " logging.error(\"Error occurred while attempting to create a Configuration\")\n", + " raise\n", + " \n", + "def execute_orchestration_deployment(configuration_id):\n", + " headers = _get_headers()\n", + " GET_DEPLOYMENTS_ENDPOINT = '/v2/lm/deployments'\n", + " request_url = 
f\"{AICORE_BASE_URL}{GET_DEPLOYMENTS_ENDPOINT}\"\n", + " \n", + " request_body = {\n", + " \"configurationId\": configuration_id\n", + " }\n", + " \n", + " try:\n", + " response = requests.post(\n", + " request_url, headers=headers, data=json.dumps(request_body), timeout=120\n", + " )\n", + " print(response)\n", + " if(response.status_code != 202):\n", + " print(\"Deployment execution failed\")\n", + " result = response.json()\n", + " print(result)\n", + " return result['id']\n", + " \n", + " except:\n", + " logging.error(\"Error occurred while attempting to create an execution\")\n", + " raise\n", + "\n", + "def get_deployment_status(orchestration_deployment_id):\n", + " headers = _get_headers()\n", + " api_url = f\"{AICORE_BASE_URL}/v2/lm/deployments/{orchestration_deployment_id}?$select=status\"\n", + " timeout = 400 \n", + " initial_interval = 30 \n", + " pending_interval = 10\n", + " start = time.time()\n", + "\n", + " status = None\n", + " current_interval = initial_interval\n", + "\n", + " while time.time() - start < timeout:\n", + " response = requests.get(api_url, headers=headers)\n", + " if response.status_code == 200:\n", + " status = response.json().get('status')\n", + " print(f\"Deployment {orchestration_deployment_id} status: {status}\")\n", + " # Adjust polling interval based on status\n", + " if status == 'RUNNING':\n", + " return True\n", + " elif status == 'UNKNOWN':\n", + " current_interval = initial_interval\n", + " elif status == 'PENDING':\n", + " current_interval = pending_interval\n", + "\n", + " else:\n", + " print(f\"Failed to fetch deployment status. 
HTTP {response.status_code}\")\n", + " return False\n", + "\n", + " # Waiting according to status for API call\n", + " time.sleep(current_interval)\n", + "\n", + "def get_deployment_url(orchestration_deployment_id):\n", + " headers = _get_headers()\n", + " response = requests.get(f\"{AICORE_BASE_URL}/v2/lm/deployments/{orchestration_deployment_id}\", headers=headers)\n", + " if response.status_code != 200:\n", + " raise Exception(f\"Failed to get deployment URL: {response.status_code} - {response.text}\")\n", + " return response.json().get('deploymentUrl')\n", + "\n", + "# You can skip this step if you already have a orchestration deployment running\n", + "deployment_url = DEPLOYMENT_URL\n", + "if not deployment_url:\n", + " configuration_id = create_orchestration_configuration()\n", + " orchestration_deployment_id = execute_orchestration_deployment(configuration_id)\n", + " is_running = get_deployment_status(orchestration_deployment_id) \n", + " if is_running:\n", + " deployment_url = get_deployment_url(orchestration_deployment_id)\n", + " print(f\"Deployment URL: {deployment_url}\")\n", + " else:\n", + " print(\"Deployment is not running or failed.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Approach Selection\n", + "\n", + "Select whether to use `Prompt Registry` or `Orchestration Registry` approach" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "50f821a5ff974acebf2de68badce0bd3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value='Please select the configuration mode:', layout=Layout(margin='10px 0px 10px 0px'))…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from ipywidgets import Checkbox, VBox, HBox, Output, Label, Layout\n", + "from IPython.display import display\n", + "import textwrap\n", + "\n", + "# --- 
Selection state ---\n", + "approach = None\n", + "suppress_update = False \n", + "\n", + "# --- Define options ---\n", + "flag_options = [\n", + " \"prompt_registry\",\n", + " \"orchestration_registry\"\n", + "]\n", + "\n", + "# --- Output widget to show current selection ---\n", + "\n", + "output = Output(layout=Layout(border=\"1px solid black\", height=\"70px\", overflow=\"auto\", width=\"900px\"))\n", + "\n", + "\n", + "# --- Handler for checkbox changes ---\n", + "def on_flag_change(change):\n", + " global approach, suppress_update\n", + " if suppress_update:\n", + " return\n", + "\n", + " if change[\"new\"]: # A checkbox was checked\n", + " suppress_update = True\n", + " # Uncheck all other checkboxes\n", + " for cb in checkboxes:\n", + " if cb.description != change[\"owner\"].description:\n", + " cb.value = False\n", + " suppress_update = False\n", + " approach = change[\"owner\"].description\n", + " else:\n", + " # Only clear if the unchecked one was the currently selected\n", + " if approach == change[\"owner\"].description:\n", + " approach = None\n", + "\n", + " # Update display once per action\n", + " with output:\n", + " output.clear_output(wait=True)\n", + " msg = f\"Selected approach: {approach or 'None'}\"\n", + " wrapped = textwrap.fill(msg, width=60)\n", + " output.append_stdout(wrapped + \"\\n\")\n", + "\n", + "# --- Create checkboxes ---\n", + "checkboxes = [\n", + " Checkbox(value=False, description=option, layout=Layout(width=\"250px\"))\n", + " for option in flag_options\n", + "]\n", + "\n", + "# --- Attach event handler ---\n", + "for cb in checkboxes:\n", + " cb.observe(on_flag_change, names=\"value\")\n", + "\n", + "# --- Display UI ---\n", + "header = Label(\n", + " value=\"Please select the configuration mode:\",\n", + " layout=Layout(margin=\"10px 0px 10px 0px\")\n", + ")\n", + "ui = VBox([header, HBox(checkboxes), output])\n", + "display(ui)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ 
+ { + "name": "stdout", + "output_type": "stream", + "text": [ + "orchestration_registry\n" + ] + } + ], + "source": [ + "# Select the approach, either \"prompt_registry\" or orchestration_registry\"\n", + "print(approach)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a Prompt Template in Prompt Registry (Prompt Registry Approach)\n", + "\n", + "The following code defines a function `create_prompt_template()` that creates a new **Prompt Template** in the SAP AI Core **Prompt Registry**.\n", + "\n", + "**Note** : If you wish to use a prompt template that already exists in prompt registry, you can manually set `prompt_template_id` in the next cell and skip executing this cell" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'message': 'Prompt updated successfully.', 'id': 'cf1908d7-c793-497d-ab8c-4e220f41102b', 'scenario': 'genai-evaluations', 'name': 'prompt-registry-eval-acc-test', 'version': '1.0.0'}\n" + ] + } + ], + "source": [ + "def create_prompt_template():\n", + " headers = _get_headers()\n", + " GET_PROMPT_TEMPLATES_ENDPOINT = '/v2/lm/promptTemplates'\n", + " request_url = f\"{AICORE_BASE_URL}{GET_PROMPT_TEMPLATES_ENDPOINT}\"\n", + " \n", + " request_body = {\n", + " \"name\": \"prompt-registry-eval-acc-test\",\n", + " \"version\": \"1.0.0\",\n", + " \"scenario\": \"genai-evaluations\",\n", + " \"spec\": PROMPT_TEMPLATE\n", + " }\n", + " try:\n", + " response = requests.post(\n", + " request_url, headers=headers, data=json.dumps(request_body), timeout=120\n", + " )\n", + " if(response.status_code != 200):\n", + " raise\n", + " result = response.json()\n", + " print(result)\n", + " return result['id']\n", + " except:\n", + " logging.error(\"Error occurred while attempting to create a prompt template\")\n", + " raise\n", + "\n", + "prompt_template_id = create_prompt_template()" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Manually set prompt_template_id here if you wish to use pre existing prompt template\n", + "# prompt_template_id=\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Select your metrics\n", + " \n", + "Tick the metrics you wish to use. If the widget does not load properly, you can manually fill in the string `selected_metrics_str`\n", + "\n", + "**Note: If your dataset does not have a reference column, DO NOT Select metrcis where reference is required.**" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4a1db8c29df04fc3b03700fca7ba83a6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value='Please choose which metrics you want to run', layout=Layout(margin='10px 0px 10px …" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from ipywidgets import Checkbox, VBox, HBox, Output, Label, Layout\n", + "import textwrap\n", + "\n", + "\n", + "# List to store selected values\n", + "selected_values = []\n", + "\n", + "# Original list of checkbox options\n", + "checkbox_options = [\n", + " \"Content Filter on Input\",\n", + " \"Content Filter on Output\",\n", + " \"Pointwise Instruction Following\",\n", + " \"Pointwise Answer Relevance\",\n", + " \"Pointwise Conciseness\"\n", + "]\n", + "\n", + "# Additional options with \"(reference required)\" in their description\n", + "additional_options = [\n", + " f\"{option} (reference required)\" for option in [\"BLEU\", \"ROUGE\", \"BERT Score\", \"Pointwise Correctness\"]\n", + "]\n", + "\n", + "# Combine both lists\n", + "all_checkbox_options = checkbox_options + additional_options\n", + "\n", + "# Function to handle checkbox changes\n", + "def on_checkbox_change(change):\n", + " global selected_metrics_str # Declare 
the string version of selected_values as global\n", + " if change['new']: # If the checkbox is checked\n", + " # Remove \"(reference required)\" before adding to the list\n", + " metric_name = change['owner'].description.replace(\" (reference required)\", \"\")\n", + " if metric_name not in selected_values:\n", + " selected_values.append(metric_name)\n", + " else: # If the checkbox is unchecked\n", + " # Remove \"(reference required)\" before removing from the list\n", + " metric_name = change['owner'].description.replace(\" (reference required)\", \"\")\n", + " if metric_name in selected_values:\n", + " selected_values.remove(metric_name)\n", + " # Convert the list to a comma-separated string\n", + " selected_metrics_str = \",\".join(selected_values)\n", + " # Display the updated list with wrapped text\n", + " with output:\n", + " output.clear_output(wait=True) # Clear the output before printing\n", + " wrapped_text = textwrap.fill(f\"Selected values: {selected_values}\", width=80)\n", + " output.append_stdout(wrapped_text + \"\\n\") # Write directly to the output widget\n", + "\n", + "# Create checkboxes for the combined options with wider layout\n", + "checkboxes = [\n", + " Checkbox(value=False, description=option, layout=Layout(width=\"900px\")) for option in all_checkbox_options\n", + "]\n", + "\n", + "# Attach the change handler to each checkbox\n", + "for checkbox in checkboxes:\n", + " checkbox.observe(on_checkbox_change, names='value')\n", + "\n", + "# Group checkboxes into rows (2 per row)\n", + "rows = [HBox(checkboxes[i:i+2]) for i in range(0, len(checkboxes), 2)]\n", + "\n", + "# Output widget to display the selected values with a scrollable and wrapped area\n", + "output = Output(layout=Layout(border=\"1px solid black\", height=\"150px\", overflow=\"auto\", width=\"900px\"))\n", + "\n", + "# Header label\n", + "header = Label(value=\"Please choose which metrics you want to run\", layout=Layout(margin=\"10px 0px 10px 0px\"))\n", + "\n", + "# Display 
the header, checkboxes in rows, and the output\n", + "display(VBox([header] + rows + [output]))\n", + "\n", + "# Initialize the string version of selected_values\n", + "selected_metrics_str = \",\".join(selected_values)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BERT Score,Pointwise Conciseness\n" + ] + } + ], + "source": [ + "# Manual Selection of Metrics\n", + "#selected_metrics_str = \"rouge\"\n", + "print(selected_metrics_str)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This script checks for an evaluation metric in SAP AI Core.\n", + "\n", + "1. You can provide Metric ID's directly by setting the variable as comma separated string:\n", + " user_metric_ids = `\"\"`\n", + " - ✅ If the ID exists, it will be returned.\n", + " \n", + "2. Script reads all `.json` and `.jsonl` files from `CUSTOM_METRIC_FOLDER` to load the custom metrics\n", + " -The loaded metrics are stored in `custom_metric_list'\n", + " - The script will use the contents of the `custom_metric_list`\n", + " to search for an existing metric by scenario + name + version.\n", + "\n", + "3. If no existing metric is found:\n", + " - A new metric will be created using the details in `custom_metric_list`.\n", + " - Required fields in custom_metric: scenario, name, version, evaluationMethod.\n", + "\n", + "4. 
At the end:\n", + " - The script prints the final Metric ID that was found or created.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Metric already exists: genai-evaluations/groundedness v0.0.1, ID = 2b3cc135-a031-4d93-8641-1f3833797034\n", + "✅ Metric already exists: genai-evaluations/groundedness v0.1.6, ID = 9b349f8e-cd39-486d-809c-3dcfa3b15ac7\n", + "⚠️ User metric ID d1868b00-1601-407a-92cd-0b9065682d1f does not exist in AI Core, skipping.\n", + "⚠️ User metric ID dbf56851-8444-45d3-a0c1-adbe210c7e771 does not exist in AI Core, skipping.\n", + "✅ All processed metric IDs: 2b3cc135-a031-4d93-8641-1f3833797034,9b349f8e-cd39-486d-809c-3dcfa3b15ac7\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "import requests\n", + "\n", + "# --- Load JSON / JSONL files ---\n", + "def load_all_metrics(folder_path):\n", + " \"\"\"\n", + " Loads all JSON and JSONL files from a folder into a single list of dicts.\n", + " \"\"\"\n", + " metrics = []\n", + " files = [f for f in os.listdir(folder_path) if f.endswith((\".json\", \".jsonl\"))]\n", + "\n", + " if not files:\n", + " print(f\"No JSON/JSONL files found in {folder_path}\")\n", + " return metrics\n", + "\n", + " for file_name in files:\n", + " file_path = os.path.join(folder_path, file_name)\n", + " try:\n", + " with open(file_path, \"r\", encoding=\"utf-8\") as f:\n", + " content = f.read().strip()\n", + " try:\n", + " data = json.loads(content)\n", + " if isinstance(data, list):\n", + " metrics.extend(data)\n", + " elif isinstance(data, dict):\n", + " metrics.append(data)\n", + " except json.JSONDecodeError:\n", + " # Attempt to parse as JSONL line by line\n", + " for line in content.splitlines():\n", + " line = line.strip()\n", + " if not line:\n", + " continue\n", + " try:\n", + " metrics.append(json.loads(line))\n", + " except json.JSONDecodeError:\n", + " print(f\"Skipping 
invalid JSON line in {file_name}: {line[:50]}...\")\n", + " except Exception as e:\n", + " print(f\"Error reading {file_name}: {e}\")\n", + " return metrics\n", + "\n", + "# --- Fetch all metrics from SAP AI Core ---\n", + "def fetch_all_metrics():\n", + " request_url = f\"{AICORE_BASE_URL}/v2/lm/evaluationMetrics\"\n", + " resp = requests.get(request_url, headers=_get_headers())\n", + " resp.raise_for_status()\n", + " return resp.json().get(\"resources\", [])\n", + "\n", + "# --- Create or fetch a metric ---\n", + "def create_or_get_metric(custom_metric, user_metric_id=None):\n", + " all_metrics = fetch_all_metrics()\n", + "\n", + " # 1️⃣ User-supplied ID lookup\n", + " if user_metric_id:\n", + " for m in all_metrics:\n", + " if m.get(\"id\") == user_metric_id:\n", + " print(f\"✅ Metric already exists by ID: {user_metric_id}\")\n", + " return user_metric_id\n", + " print(f\"⚠️ User metric ID {user_metric_id} not found, will only include if valid later\")\n", + "\n", + " # 2️⃣ Check by scenario, name, version\n", + " scenario = custom_metric.get(\"scenario\")\n", + " name = custom_metric.get(\"name\")\n", + " version = custom_metric.get(\"version\")\n", + " if not all([scenario, name, version]):\n", + " raise ValueError(\"Metric must include 'scenario', 'name', and 'version'\")\n", + "\n", + " for m in all_metrics:\n", + " if (m.get(\"scenario\") == scenario and\n", + " m.get(\"name\") == name and\n", + " m.get(\"version\") == version):\n", + " metric_id = m.get(\"id\")\n", + " print(f\"✅ Metric already exists: {scenario}/{name} v{version}, ID = {metric_id}\")\n", + " return metric_id\n", + "\n", + " # 3️⃣ Create metric if not found\n", + " request_url = f\"{AICORE_BASE_URL}/v2/lm/evaluationMetrics\"\n", + " required_fields = [\"scenario\", \"name\", \"version\", \"evaluationMethod\", \"metricType\"]\n", + " for f in required_fields:\n", + " if f not in custom_metric:\n", + " raise ValueError(f\"❌ Missing required field: {f}\")\n", + "\n", + " resp = 
requests.post(request_url, headers=_get_headers(), json=custom_metric)\n", + " resp.raise_for_status()\n", + " metric_id = resp.json().get(\"id\")\n", + " print(f\"✅ Metric created successfully: {name} v{version}, ID = {metric_id}\")\n", + " return metric_id\n", + "\n", + "# --- Main pipeline ---\n", + "CUSTOM_METRIC_FOLDER = \"./PUT_YOUR_CUSTOM_METRIC_HERE\"\n", + "user_metric_ids = \"d1868b00-1601-407a-92cd-0b9065682d1f,dbf56851-8444-45d3-a0c1-adbe210c7e771\" # set by user if needed\n", + "\n", + "# 1️⃣ Load all metrics from JSON/JSONL\n", + "custom_metric_list = load_all_metrics(CUSTOM_METRIC_FOLDER)\n", + "\n", + "# 2️⃣ Create/fetch metrics from SAP AI Core\n", + "metric_ids = []\n", + "for metric in custom_metric_list:\n", + " try:\n", + " metric_id = create_or_get_metric(metric)\n", + " metric_ids.append(metric_id)\n", + " except ValueError as e:\n", + " print(f\"Skipping metric due to error: {e}\")\n", + "\n", + "# 3️⃣ Validate user_metric_ids separately if provided\n", + "if user_metric_ids and user_metric_ids.strip():\n", + " all_metrics = fetch_all_metrics()\n", + " # Split comma-separated IDs and strip whitespace\n", + " for uid in [uid.strip() for uid in user_metric_ids.split(\",\")]:\n", + " if any(m.get(\"id\") == uid for m in all_metrics):\n", + " metric_ids.append(uid)\n", + " else:\n", + " print(f\"⚠️ User metric ID {uid} does not exist in AI Core, skipping.\")\n", + "# 4️⃣ Convert to comma-separated string\n", + "custom_metric_ids_str = \",\".join(metric_ids)\n", + "print(\"✅ All processed metric IDs:\", custom_metric_ids_str)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Select your Models\n", + " \n", + "Tick the metrics you wish to use. 
If the widget does not load properly, you can manually fill in the string `selected_models_str`\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1ab806f2179a4deab49fd0cd761a863a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value='Please choose which LLM models you want to run:', layout=Layout(margin='10px 0px 1…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import requests\n", + "import textwrap\n", + "from functools import partial\n", + "from ipywidgets import Checkbox, VBox, HBox, Output, Label, Layout\n", + "from IPython.display import display\n", + "\n", + "\n", + "# --- Call the API ---\n", + "GET_MODELS_ENDPOINT = '/v2/lm/scenarios/foundation-models/models'\n", + "request_url = f\"{AICORE_BASE_URL}{GET_MODELS_ENDPOINT}\"\n", + "headers = _get_headers() # your existing function\n", + "response = requests.get(request_url, headers=headers)\n", + "models_data = response.json()\n", + "# --- Extract model options ---\n", + "llm_options = []\n", + "resources = models_data.get(\"resources\", [])\n", + "for m in resources:\n", + " display_name = m.get(\"displayName\") or m.get(\"name\") or m.get(\"model\", \"Unknown\")\n", + " model_id = m.get(\"model\", m.get(\"id\", \"unknown\"))\n", + "\n", + " versions = m.get(\"versions\", [])\n", + " version = \"latest\"\n", + " if versions:\n", + " latest = next((v for v in versions if v.get(\"isLatest\")), versions[0])\n", + " version = latest.get(\"name\") or \"latest\"\n", + "\n", + " label = f\"{display_name} ({model_id}, v:{version})\"\n", + " value = f\"{model_id}:{version}\"\n", + " llm_options.append({\"label\": label, \"value\": value})\n", + "\n", + "# --- Selection state ---\n", + "selected_models = []\n", + "selected_models_str = \"\" # <-- your comma-separated string\n", + "output = 
Output(layout=Layout(border=\"1px solid black\", height=\"150px\", overflow=\"auto\", width=\"900px\"))\n", + "\n", + "def update_output():\n", + " global selected_models_str\n", + " selected_models_str = \",\".join(selected_models)\n", + " with output:\n", + " output.clear_output(wait=True)\n", + " if selected_models_str:\n", + " wrapped_text = textwrap.fill(f\"Selected models: {selected_models_str}\", width=80)\n", + " output.append_stdout(wrapped_text + \"\\n\")\n", + "\n", + "# --- Callback handler ---\n", + "def on_checkbox_change(model_value, change):\n", + " # Ignore redundant triggers\n", + " if change[\"old\"] == change[\"new\"]:\n", + " return\n", + "\n", + " if approach == \"prompt_registry\":\n", + " # Multiple selections allowed\n", + " if change[\"new\"]:\n", + " if model_value not in selected_models:\n", + " selected_models.append(model_value)\n", + " else:\n", + " if model_value in selected_models:\n", + " selected_models.remove(model_value)\n", + " else:\n", + " # Single selection only\n", + " if change[\"new\"]:\n", + " # Uncheck all other boxes\n", + " for cb in checkboxes:\n", + " if cb.model_value != model_value:\n", + " cb.unobserve_all()\n", + " cb.value = False\n", + " cb.observe(partial(on_checkbox_change, cb.model_value), names=\"value\")\n", + " selected_models.clear()\n", + " selected_models.append(model_value)\n", + " else:\n", + " if model_value in selected_models:\n", + " selected_models.remove(model_value)\n", + "\n", + " update_output()\n", + "\n", + "# --- Create checkboxes + labels ---\n", + "checkboxes = []\n", + "checkbox_rows = []\n", + "for opt in llm_options:\n", + " cb = Checkbox(value=False, indent=False, layout=Layout(width=\"30px\"))\n", + " cb.model_value = opt[\"value\"] # attach model identifier\n", + " cb.observe(partial(on_checkbox_change, opt[\"value\"]), names=\"value\")\n", + " lbl = Label(value=opt[\"label\"], layout=Layout(width=\"850px\"))\n", + " checkboxes.append(cb)\n", + " checkbox_rows.append(HBox([cb, 
lbl]))\n", + "\n", + "# --- Display ---\n", + "header = Label(value=\"Please choose which LLM models you want to run:\", layout=Layout(margin=\"10px 0px 10px 0px\"))\n", + "display(VBox([header] + checkbox_rows + [output]))\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected models string: gpt-4o:2024-08-06\n" + ] + } + ], + "source": [ + "# Manual selection of models\n", + "# selected_models_str=\"gpt-4o:2024-08-06\"\n", + "print(\"Selected models string:\", selected_models_str)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create orchestration registry config (Only needed if you chose orchestraion registry approach)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'message': 'Orchestration config updated successfully.', 'id': '96f7424b-496d-4c47-8f14-0c6562e7744f', 'scenario': 'genai-evaluations', 'name': 'genai-eval-test', 'version': '1.0.0'}\n" + ] + } + ], + "source": [ + "def create_orchestration_registry_config():\n", + " headers = _get_headers()\n", + " CREATE_ORCHESTRATION_REGISTRY = '/v2/registry/v2/orchestrationConfigs'\n", + " request_url = f\"{AICORE_BASE_URL}{CREATE_ORCHESTRATION_REGISTRY}\"\n", + " model_name,model_version=selected_models_str.split(\":\")\n", + " request_body = {\n", + " \"name\": \"genai-eval-test\",\n", + " \"version\": \"1.0.0\",\n", + " \"scenario\": \"genai-evaluations\",\n", + " \"spec\": {\n", + " \"modules\": {\n", + " \"prompt_templating\": {\n", + " \"model\": {\n", + " \"name\": model_name,\n", + " \"version\": model_version\n", + " },\n", + " \"prompt\": PROMPT_TEMPLATE\n", + " }\n", + " }\n", + " }\n", + " }\n", + " try:\n", + " response = requests.post(\n", + " request_url, headers=headers, data=json.dumps(request_body), timeout=120\n", + " 
)\n", + " if(response.status_code != 200):\n", + " print(response.json())\n", + " raise\n", + " result = response.json()\n", + " print(result)\n", + " return result['id']\n", + " except:\n", + " logging.error(\"Error occurred while attempting to create a orchestration registry id\")\n", + " raise\n", + "orchestration_registry_id = create_orchestration_registry_config()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Start Evaluation Run (Step 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected metrics: BERT Score,Pointwise Conciseness,2b3cc135-a031-4d93-8641-1f3833797034,9b349f8e-cd39-486d-809c-3dcfa3b15ac7\n", + "Selected models: gpt-4o:2024-08-06\n" + ] + } + ], + "source": [ + "\n", + "import json\n", + "test_data_path = f\"testdata/{DATASET_NAME}\" # specify the test data path here. For the full folder just specifying testdata will work\n", + "test_datasets = json.dumps({'path': test_data_path, 'type': 'csv'})\n", + "metrics_list = \",\".join([selected_metrics_str,custom_metric_ids_str])\n", + "models_list = selected_models_str\n", + "print(f\"Selected metrics: {metrics_list}\")\n", + "print(f\"Selected models: {models_list}\")\n", + "#variable_mapping = json.dumps({'prompt/question': 'data/topic'}) # to map the question prompt variable to the entry in dataset.\n", + "# orchestration_deployment_url = deployment_url # needs to specify this to use a specific deployment id\n", + "orchestration_deployment_url = \"https://api.ai.internalprod.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/d92895b42bdd6175\"\n", + "repetitions = \"1\"" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "{'id': 'fc21348f-4232-4579-8d21-062b88328ef0', 'message': 'Configuration created'}\n" + ] + } + ], + 
"source": [ + "# creating an AICORE Configuration.\n", + "import requests\n", + "\n", + "request_body = {\n", + " \"name\": \"genai-eval-conf\",\n", + " \"scenarioId\": \"genai-evaluations\",\n", + " \"executableId\": \"genai-evaluations-simplified\",\n", + " \"inputArtifactBindings\": [\n", + " {\n", + " \"key\": \"datasetFolder\",\n", + " \"artifactId\": artifact_id\n", + " }\n", + " ],\n", + " \"parameterBindings\": [\n", + " {\n", + " \"key\": \"repetitions\",\n", + " \"value\": repetitions\n", + " },\n", + " {\n", + " \"key\": \"orchestrationDeploymentURL\",\n", + " \"value\": orchestration_deployment_url\n", + " },\n", + " {\n", + " \"key\": \"metrics\",\n", + " \"value\": metrics_list\n", + " },\n", + " {\n", + " \"key\": \"testDataset\",\n", + " \"value\": test_datasets\n", + " },\n", + " {\n", + " \"key\": \"promptTemplate\",\n", + " \"value\": prompt_template_id if approach == \"prompt_registry\" else \"\"\n", + " },\n", + " {\n", + " \"key\": \"models\",\n", + " \"value\": models_list if approach == \"prompt_registry\" else \"\"\n", + " },\n", + " {\n", + " \"key\": \"orchestrationRegistryIds\",\n", + " \"value\": orchestration_registry_id if approach == \"orchestration_registry\" else \"\"\n", + " }\n", + " ]\n", + "}\n", + "\n", + "def create_aicore_configuration():\n", + " headers = _get_headers()\n", + " GET_CONFIGURATIONS_ENDPOINT = '/v2/lm/configurations'\n", + " request_url = f\"{AICORE_BASE_URL}{GET_CONFIGURATIONS_ENDPOINT}\"\n", + " try:\n", + " response = requests.post(\n", + " request_url, headers=headers, data=json.dumps(request_body), timeout=120\n", + " )\n", + " print(response)\n", + " if(response.status_code != 201):\n", + " raise\n", + " result = response.json()\n", + " print(result)\n", + " return result['id']\n", + " except:\n", + " logging.error(\"Error occurred while attempting to create a Configuration\")\n", + " raise\n", + " \n", + "configuration_id = create_aicore_configuration()" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "## Evaluation Execution Creation\n", + "Once Configration is create, we create the AI Core execution which triggers the evaluation workload.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "response received is \n", + "{'id': 'e8d0adb660289477', 'message': 'Execution scheduled', 'status': 'UNKNOWN', 'targetStatus': 'COMPLETED'}\n" + ] + } + ], + "source": [ + "# create an execution with the created configuration.\n", + "\n", + "import requests\n", + "def create_execution():\n", + " headers = _get_headers()\n", + " GET_EXECUTIONS_ENDPOINT = '/v2/lm/executions'\n", + " request_url = f\"{AICORE_BASE_URL}{GET_EXECUTIONS_ENDPOINT}\"\n", + " request_body = {\"configurationId\" : configuration_id} \n", + " try:\n", + " response = requests.post(\n", + " request_url, headers=headers, data=json.dumps(request_body), timeout=120\n", + " )\n", + " print(\"response received is \", response)\n", + " result = response.json()\n", + " print(result)\n", + " return result['id']\n", + " except:\n", + " logging.error(\"Error occurred while attempting to create an execution\")\n", + " raise\n", + " \n", + "\n", + "execution_id = create_execution()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "response received is \n" + ] + }, + { + "data": { + "text/plain": [ + "{'id': 'e8d0adb660289477',\n", + " 'createdAt': '2025-11-14T05:36:34Z',\n", + " 'modifiedAt': '2025-11-14T05:36:34Z',\n", + " 'status': 'COMPLETED',\n", + " 'scenarioId': 'genai-evaluations',\n", + " 'configurationId': 'fc21348f-4232-4579-8d21-062b88328ef0',\n", + " 'targetStatus': 'COMPLETED',\n", + " 'submissionTime': '2025-11-14T05:36:56Z',\n", + " 'startTime': '2025-11-14T05:36:56Z',\n", + " 'completionTime': '2025-11-14T05:52:15Z',\n", + " 'configurationName': 
'genai-eval-conf',\n", + " 'executableId': 'genai-evaluations-simplified',\n", + " 'outputArtifacts': [{'id': '80dfbccc-b559-496c-a03a-4d21b18c11be',\n", + " 'createdAt': '2025-11-14T05:52:06Z',\n", + " 'modifiedAt': '2025-11-14T05:52:06Z',\n", + " 'url': 'ai://default/e8d0adb660289477/evaluation_result',\n", + " 'name': 'evaluation_result',\n", + " 'kind': 'resultset',\n", + " 'description': '',\n", + " 'scenarioId': 'genai-evaluations',\n", + " 'executionId': 'e8d0adb660289477'},\n", + " {'id': '2cb535ce-33d0-482d-a673-ba66b0e1df6f',\n", + " 'createdAt': '2025-11-14T05:50:23Z',\n", + " 'modifiedAt': '2025-11-14T05:50:23Z',\n", + " 'url': 'ai://default/e8d0adb660289477/debug_files',\n", + " 'name': 'debug_files',\n", + " 'kind': 'resultset',\n", + " 'description': '',\n", + " 'scenarioId': 'genai-evaluations',\n", + " 'executionId': 'e8d0adb660289477'}]}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get execution status\n", + "import requests\n", + "def get_execution_status(execution_id):\n", + " headers = _get_headers()\n", + " LOG_EXECUTIONS_ENDPOINT = f'/v2/lm/executions/{execution_id}'\n", + " request_url = f\"{AICORE_BASE_URL}{LOG_EXECUTIONS_ENDPOINT}\"\n", + " try:\n", + " response = requests.get(\n", + " request_url, headers=headers, timeout=120\n", + " )\n", + " print(\"response received is \", response)\n", + " result = response.json()\n", + " return result\n", + " except:\n", + " logging.error(\"Error occurred while attempting to get execution status\")\n", + " raise\n", + " \n", + "\n", + "get_execution_status(execution_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "1. Run the following cells only when the status field in the Execution response is \"COMPLETED\" to view the results.\n", + "2. The status field progresses through different states over time: UNKNOWN → PENDING → RUNNING → COMPLETED. 
Ensure it reaches COMPLETED before proceeding.\n", + "\n", + "\n", + "Note: The targetStatus will always be COMPLETED from the start, as it represents the intended final state of the Execution. Do not confuse it with the actual status field.\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluation Result (Step 4)\n", + "The evaluation job produces two outputs\n", + "1. A SQLite DB file which stores the orchestration input, orchestration output, values for all the metrics calculated for this orchestration output and statistics such as latency for this orchestration output. These metric values are called raw metric values. This SQLite DB file is stored in the object store as an AI Core output artifact.\n", + "2. A set of metrics whose values are aggregated from the raw metric values. The aggregate metrics are stored in the tracking service. The user-defined tags along with the run names are stored with the metrics.\n", + "Post execution completion user can see the runs generated by the workload along with the aggregate metrics by calling the tracking api as show below" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "response received is \n" + ] + } + ], + "source": [ + "# Get aggregate metrics using execution id\n", + "import requests\n", + "def retrieve_aggregate_metrics(execution_id):\n", + " headers = _get_headers()\n", + " GET_METRICS_ENDPOINT = f'/v2/lm/metrics?tagFilters=evaluation.ai.sap.com/child-of={execution_id}'\n", + " request_url = f\"{AICORE_BASE_URL}{GET_METRICS_ENDPOINT}\"\n", + " try:\n", + " response = requests.get(request_url, headers=headers, timeout=120)\n", + " print(\"response received is \", response)\n", + " result = response.json()\n", + " return result\n", + " except:\n", + " logging.error(\"Error occurred while attempting to retreive aggeregate metrics for the run\")\n", + " raise\n", + 
"\n", + "runs_data = retrieve_aggregate_metrics(execution_id)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metrics_nameBERT Score/F1/meanBERT Score/F1/medianBERT Score/F1/p90BERT Score/F1/p95BERT Score/F1/stddevBERT Score/Precision/meanBERT Score/Precision/medianBERT Score/Precision/p90BERT Score/Precision/p95BERT Score/Precision/stddevBERT Score/Recall/meanBERT Score/Recall/medianBERT Score/Recall/p90BERT Score/Recall/p95BERT Score/Recall/stddevPointwise Conciseness/1/countPointwise Conciseness/2/countPointwise Conciseness/3/countPointwise Conciseness/4/countPointwise Conciseness/5/countPointwise Conciseness/meanPointwise Conciseness/medianPointwise Conciseness/p90Pointwise Conciseness/p95Pointwise Conciseness/stddevcompletion_tokens/sumgroundedness/0/countgroundedness/1/countgroundedness/2/countgroundedness/3/countgroundedness/4/countgroundedness/5/countlatency/averageprompt_tokens/sumsubmission/sum
model
gpt-4o0.4665450.4869340.5501160.5614220.0772350.4145020.4457250.4984970.5051580.0810790.5394360.5481890.638810.6629220.0810680.00.013.022.014.04.0204084.05.05.00.74971619553.00.00.00.00.00.00.0105.2847391608.049.0
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "from IPython.display import HTML\n", + "\n", + "def get_model_from_run(run):\n", + " for tag in run.get(\"tags\", []):\n", + " if tag.get(\"name\") == \"evaluation.ai.sap.com/model\":\n", + " return tag.get(\"value\")\n", + "\n", + "def aggregate_metrics_by_model(runs_list):\n", + " transformed_data = []\n", + " for run in runs_list:\n", + " model = get_model_from_run(run)\n", + " for metric in run[\"metrics\"]:\n", + " output_json = {\n", + " \"model\": model,\n", + " \"metrics_name\": metric.get(\"name\"),\n", + " \"metric_value\": metric.get(\"value\")\n", + " }\n", + " transformed_data.append(output_json)\n", + " return transformed_data\n", + "\n", + "\n", + "def create_metrics_pivot_table(transformed_data):\n", + " \"\"\"\n", + " Creates a pivot table where rows are models and columns are metrics.\n", + " \n", + " Args:\n", + " transformed_data: List of dictionaries with 'model', 'metrics_name', 'metric_value'\n", + " \n", + " Returns:\n", + " DataFrame with models as rows and metrics as columns\n", + " \"\"\"\n", + " # Convert list of dictionaries to DataFrame\n", + " df = pd.DataFrame(transformed_data)\n", + " \n", + " # Create pivot table\n", + " pivot_table = df.pivot_table(\n", + " index='model',\n", + " columns='metrics_name',\n", + " values='metric_value',\n", + " aggfunc='first' # Use 'first' to get the single value, or 'mean' if there are duplicates\n", + " )\n", + " \n", + " return pivot_table\n", + "\n", + "transformed_data = aggregate_metrics_by_model(runs_data['resources'])\n", + "metrics_pivot = create_metrics_pivot_table(transformed_data)\n", + "\n", + "HTML(metrics_pivot.to_html())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Process Your Execution Results\n", + "The next steps will take your evaluation run and process it such that it will give 
you a point based ranking of which particular model did the best based on the metrics you selected" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BERT Score_scorePointwise Conciseness_score
model
gpt-4o-0.0530110.510204
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
modeltotal_scorerank
0gpt-4o0.2285961
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "from IPython.display import HTML\n", + "\n", + "# Scoring logic depends on \"scoring_type\"\n", + "# \"weight\" represents the relative weight of this metric to all SELECTED metrics\n", + "METRICS_SCORING_TYPE_MAPPING = {\n", + " \"Content Filter on Input\": {\n", + " \"scoring_type\": \"bool-false\", # False is good\n", + " \"weight\": 1\n", + " },\n", + " \"Content Filter on Output\": {\n", + " \"scoring_type\": \"bool-false\", # False is good\n", + " \"weight\": 1\n", + " },\n", + " \"Pointwise Instruction Following\": {\n", + " \"scoring_type\": \"num_1_to_5\",\n", + " \"weight\": 1\n", + " },\n", + " \"Pointwise Answer Relevance\": {\n", + " \"scoring_type\": \"num_1_to_5\",\n", + " \"weight\": 1\n", + " },\n", + " \"Pointwise Conciseness\": {\n", + " \"scoring_type\": \"num_1_to_5\",\n", + " \"weight\": 1\n", + " },\n", + " \"Pointwise Correctness\": {\n", + " \"scoring_type\": \"num_1_to_5\",\n", + " \"weight\": 1\n", + " },\n", + " \"BLEU\": {\n", + " \"scoring_type\": \"num_0_to_1\",\n", + " \"weight\": 1\n", + " },\n", + " \"ROUGE\": {\n", + " \"scoring_type\": \"num_0_to_1\",\n", + " \"weight\": 1\n", + " },\n", + " \"BERT Score\": {\n", + " \"scoring_type\": \"F1/Precision/Recall num_0_to_1\",\n", + " \"weight\": 1\n", + " }\n", + "}\n", + "\n", + "def calculate_bool_metric_score(pivot_df, metric_base_name, true_is_good):\n", + " \"\"\"\n", + " Calculate scores for boolean metrics based on False/True counts.\n", + " \n", + " Args:\n", + " pivot_df: DataFrame with models as rows and metrics as columns\n", + " metric_base_name: Base name of the metric (without /False/count or /True/count)\n", + " true_is_good: Boolean indicating if True is considered a good outcome\n", + " \n", + " Returns:\n", + " Series with boolean metric scores per model (scaled to -1 to 1)\n", + " \"\"\"\n", + " false_col = 
f\"{metric_base_name}/False/count\"\n", + " true_col = f\"{metric_base_name}/True/count\"\n", + " \n", + " false_values = pivot_df[false_col] if false_col in pivot_df.columns else 0\n", + " true_values = pivot_df[true_col] if true_col in pivot_df.columns else 0\n", + " total_values = true_values + false_values\n", + "\n", + " score = ((false_values * 1) + (true_values * -1)) / total_values\n", + "\n", + " if true_is_good:\n", + " score = 0 - score\n", + "\n", + " return score\n", + "\n", + "def calculate_numeric_metric_score(pivot_df, metric_base_name, range_min=0, range_max=1):\n", + " \"\"\"\n", + " Calculate scores for numeric metrics with /mean\n", + " The mean is normalized to a score between -1 and 1 using the provided range.\n", + " \n", + " Args:\n", + " pivot_df: DataFrame with models as rows and metrics as columns\n", + " metric_base_name: Base name of the metric (without suffixes)\n", + " range_min: Minimum possible value of the metric\n", + " range_max: Maximum possible value of the metric\n", + " \n", + " Returns:\n", + " Series with numeric metric scores per model (scaled to -1 to 1)\n", + " \"\"\"\n", + " mean_col = f\"{metric_base_name}/mean\"\n", + " \n", + " if mean_col not in pivot_df.columns:\n", + " return pd.Series(0.0, index=pivot_df.index)\n", + " \n", + " mean_values = pivot_df[mean_col]\n", + " \n", + " # Linear normalization from [range_min, range_max] to [0, 1]\n", + " normalized = (mean_values - range_min) / (range_max - range_min)\n", + " \n", + " # Scale to [-1, 1]\n", + " score = (normalized * 2) - 1\n", + " \n", + " return score\n", + "\n", + "def calculate_bert_score(pivot_df, metric_base_name):\n", + " \"\"\"\n", + " Calculate BERT Score by averaging F1, Precision, and Recall scores.\n", + " \n", + " Args:\n", + " pivot_df: DataFrame with models as rows and metrics as columns\n", + " metric_base_name: Base name \"BERT Score\"\n", + " \n", + " Returns:\n", + " Series with BERT scores per model (scaled to -1 to 1)\n", + " \"\"\"\n", 
+ " f1_col = f\"{metric_base_name}/F1/mean\"\n", + " precision_col = f\"{metric_base_name}/Precision/mean\"\n", + " recall_col = f\"{metric_base_name}/Recall/mean\"\n", + " \n", + " scores = []\n", + " for col in [f1_col, precision_col, recall_col]:\n", + " if col in pivot_df.columns:\n", + " scores.append(pivot_df[col])\n", + " \n", + " if not scores:\n", + " return pd.Series(0.0, index=pivot_df.index)\n", + " \n", + " # Average the three metrics (already in 0 to 1 range)\n", + " avg_score = sum(scores) / len(scores)\n", + " \n", + " # Scale to [-1, 1]\n", + " score = (avg_score * 2) - 1\n", + " \n", + " return score\n", + "\n", + "def find_unique_metrics_in_pivot(pivot_df):\n", + " \"\"\"\n", + " Identify unique metric base names present in the pivot table.\n", + " \n", + " Args:\n", + " pivot_df: DataFrame with models as rows and metrics as columns\n", + " \"\"\"\n", + " # Extract unique metric names from pivot table columns\n", + " unique_metrics = set()\n", + " for col in pivot_df.columns:\n", + " # Extract base metric name by removing suffixes\n", + " base_name = col\n", + " for suffix in ['/False/count', '/True/count', '/F1_score/mean','/Precision_score/mean', \n", + " '/Recall_score/mean','/mean','/median', '/p90', '/p95', '/stddev']:\n", + " if suffix in base_name and \"BERT Score\" not in base_name:\n", + " base_name = base_name.replace(suffix, '')\n", + " unique_metrics.add(base_name)\n", + " break\n", + " if base_name.startswith(\"BERT Score/\"):\n", + " base_name = \"BERT Score\"\n", + " unique_metrics.add(base_name)\n", + " if not unique_metrics:\n", + " raise ValueError(\"No valid metrics found in pivot table\")\n", + " return unique_metrics\n", + "\n", + "\n", + "def rank_models(pivot_df, unique_metrics=None):\n", + " \"\"\"\n", + " Rank models based on metrics present in the pivot table.\n", + " \n", + " Args:\n", + " pivot_df: DataFrame with models as rows (index) and metrics as columns\n", + " \n", + " Returns:\n", + " DataFrame with model 
rankings and scores\n", + " \"\"\" \n", + " # Calculate total weight for metrics present in pivot table\n", + " total_weight = sum(METRICS_SCORING_TYPE_MAPPING[m][\"weight\"] for m in unique_metrics)\n", + " \n", + " # Initialize total score\n", + " total_scores = pd.Series(0.0, index=pivot_df.index)\n", + " \n", + " # Process each metric found in the pivot table\n", + " for metric_name in unique_metrics:\n", + " config = METRICS_SCORING_TYPE_MAPPING[metric_name]\n", + " scoring_type = config[\"scoring_type\"]\n", + " weight = config[\"weight\"] / total_weight\n", + " \n", + " if scoring_type == \"bool-false\":\n", + " # False is good (True is bad)\n", + " metric_score = calculate_bool_metric_score(pivot_df, metric_name, true_is_good=False)\n", + " total_scores += metric_score * weight\n", + " \n", + " elif scoring_type == \"bool-true\":\n", + " # True is good (False is bad)\n", + " metric_score = calculate_bool_metric_score(pivot_df, metric_name, true_is_good=True)\n", + " total_scores += metric_score * weight\n", + " \n", + " elif scoring_type == \"num_1_to_5\":\n", + " metric_score = calculate_numeric_metric_score(pivot_df, metric_name, range_min=1, range_max=5)\n", + " total_scores += metric_score * weight\n", + " \n", + " elif scoring_type == \"num_0_to_1\":\n", + " metric_score = calculate_numeric_metric_score(pivot_df, metric_name, range_min=0, range_max=1)\n", + " total_scores += metric_score * weight\n", + " \n", + " elif scoring_type == \"F1/Precision/Recall num_0_to_1\":\n", + " # BERT Score\n", + " metric_score = calculate_bert_score(pivot_df, metric_name)\n", + " total_scores += metric_score * weight\n", + " \n", + " # Create results DataFrame\n", + " results_df = pd.DataFrame({\n", + " 'model': pivot_df.index,\n", + " 'total_score': total_scores.values\n", + " })\n", + " \n", + " # Rank models (higher score = better rank)\n", + " results_df['rank'] = results_df['total_score'].rank(ascending=False, method='min').astype(int)\n", + " results_df = 
results_df.sort_values('rank')\n", + " \n", + " return results_df\n", + "\n", + "def get_detailed_scores(pivot_df, unique_metrics):\n", + " \"\"\"\n", + " Get detailed breakdown of scores per metric for each model.\n", + " \n", + " Args:\n", + " pivot_df: DataFrame with models as rows and metrics as columns\n", + " \n", + " Returns:\n", + " DataFrame with detailed scores per metric\n", + " \"\"\"\n", + " detailed_scores = pd.DataFrame(index=pivot_df.index)\n", + " \n", + " # Process each metric in the mapping\n", + " for metric_name in unique_metrics:\n", + " scoring_type = METRICS_SCORING_TYPE_MAPPING[metric_name][\"scoring_type\"]\n", + " \n", + " if scoring_type == \"bool-false\":\n", + " detailed_scores[f\"{metric_name}_score\"] = calculate_bool_metric_score(pivot_df, metric_name, true_is_good=False)\n", + " \n", + " elif scoring_type == \"bool-true\":\n", + " detailed_scores[f\"{metric_name}_score\"] = calculate_bool_metric_score(pivot_df, metric_name, true_is_good=True)\n", + " \n", + " elif scoring_type == \"num_1_to_5\":\n", + " detailed_scores[f\"{metric_name}_score\"] = calculate_numeric_metric_score(pivot_df, metric_name, range_min=1, range_max=5)\n", + " \n", + " elif scoring_type == \"num_0_to_1\":\n", + " detailed_scores[f\"{metric_name}_score\"] = calculate_numeric_metric_score(pivot_df, metric_name, range_min=0, range_max=1)\n", + " \n", + " elif scoring_type == \"F1/Precision/Recall num_0_to_1\":\n", + " detailed_scores[f\"{metric_name}_score\"] = calculate_bert_score(pivot_df, metric_name)\n", + " \n", + " return detailed_scores\n", + "\n", + "unique_metrics = find_unique_metrics_in_pivot(metrics_pivot)\n", + "\n", + "# Get detailed scores breakdown\n", + "detailed = get_detailed_scores(metrics_pivot, unique_metrics)\n", + "display(HTML(detailed.to_html()))\n", + "\n", + "# Rank models\n", + "ranking = rank_models(metrics_pivot, unique_metrics)\n", + "display(HTML(ranking.to_html()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "To further drill down , User can also download the SQLite DB file from object storage and analyse the results(instance level metrics, logs etc) locally." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading e8d0adb660289477/evaluation_result/results.db to results-new\\results.db\n" + ] + } + ], + "source": [ + "# download the result artifacts from Object store.\n", + "import boto3\n", + "\n", + "def download_all_objects(prefix, destination_folder):\n", + " \"\"\"\n", + " Recursively download all objects from an S3 bucket starting with a specific prefix.\n", + "\n", + " :param bucket_name: Name of the S3 bucket.\n", + " :param prefix: Prefix to filter objects in the bucket.\n", + " :param destination_folder: Local folder to save the downloaded files.\n", + " \"\"\"\n", + " s3_client = boto3.client(\n", + " 's3',\n", + " aws_access_key_id=AWS_ACCESS_KEY,\n", + " aws_secret_access_key=AWS_SECRET_ACCESS_KEY,\n", + " region_name=AWS_REGION\n", + " )\n", + "\n", + " # Ensure the destination folder exists\n", + " if not os.path.exists(destination_folder):\n", + " os.makedirs(destination_folder)\n", + "\n", + " # Paginate through objects\n", + " paginator = s3_client.get_paginator('list_objects_v2')\n", + " pages = paginator.paginate(Bucket=AWS_BUCKET_ID, Prefix=prefix)\n", + "\n", + " for page in pages:\n", + " if 'Contents' in page:\n", + " for obj in page['Contents']:\n", + " key = obj['Key']\n", + " local_file_path = os.path.join(destination_folder, os.path.relpath(key, prefix))\n", + "\n", + " # Ensure the local directory structure exists\n", + " local_directory = os.path.dirname(local_file_path)\n", + " if not os.path.exists(local_directory):\n", + " os.makedirs(local_directory)\n", + "\n", + " # Download the object\n", + " print(f\"Downloading {key} to {local_file_path}\")\n", + " s3_client.download_file(AWS_BUCKET_ID, key, 
local_file_path)\n", + "\n", + "\n", + "# Download the evaluation results from the object store. Look at execution status under \"outputArtifacts\" key to see the 'url'\n", + "# which shows the data path of where your output results are stored\n", + "EXECUTION_ID = execution_id\n", + "sqlite_db_prefix = f'{EXECUTION_ID}/evaluation_result/' # change the prefix based on where your output artifact is stored in the bucket.\n", + "destination_folder = 'results-new'\n", + "\n", + "download_all_objects(sqlite_db_prefix, destination_folder)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NOTE: The below Cell shows results of top 10 rows of the Evaluation Results across all SQLite tables. IF you wish to see all the entries you can comment the line saying df.head(10) in the below cell or modify the number accordingly." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "\n", + "
\n", + "

Table: run

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameconfigtagscreated_atupdated_at
960997700d79409680974700c0d67825Run-genai-eval-test-gpt-4o-2024-08-06{\"modules\": {\"prompt_templating\": {\"prompt\": {\"template\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: {{?question}}.\", \"role\": \"user\"}]}, \"model\": {\"name\": \"gpt-4o\", \"version\": \"2024-08-06\", \"timeout\": 600, \"max_retries\": 2}}}}{}2025-11-14 05:38:21.9641282025-11-14 05:38:21.964132
\n", + "
\n", + " \n", + "
\n", + "

Table: configuration

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtest_datasetsmetricsvariable_mappingtagsorchestration_deployment_urlrepetitionsmetric_templatescreated_atupdated_at
076096962816435e8e03c0aed7fa4743{\"path\": \"testdata/medicalqna_dataset.csv\", \"type\": \"csv\"}[\"BERT Score\", \"Pointwise Conciseness\", \"2b3cc135-a031-4d93-8641-1f3833797034\", \"9b349f8e-cd39-486d-809c-3dcfa3b15ac7\"]{}{}https://api.ai.internalprod.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/d92895b42bdd61751[{\"evaluationMethod\": \"computed\", \"scenario\": \"genai-evaluations\", \"createdAt\": \"0001-01-01 00:00:00+00:00\", \"managedBy\": \"imperative\", \"metricType\": \"evaluation\", \"systemPredefined\": true, \"id\": \"93a16045-d577-4132-8481-9497cb205961\", \"name\": \"BERT Score\", \"description\": \"Bertscore is a metric for evaluating the quality of text generation by comparing it to reference texts. It leverages BERT, a pre-trained transformer model, to compute contextual embeddings for each token in both the candidate and reference sentences.\", \"version\": \"1.0.0\", \"includeProperties\": [\"reference\"], \"additionalProperties\": {\"variables\": [], \"output_type\": \"numerical\", \"supported_values\": [0, 1], \"experimental\": false}}, {\"evaluationMethod\": \"llm-as-a-judge\", \"scenario\": \"genai-evaluations\", \"createdAt\": \"0001-01-01 00:00:00+00:00\", \"managedBy\": \"imperative\", \"metricType\": \"evaluation\", \"systemPredefined\": true, \"id\": \"95c03e1b-3938-42dd-bc69-3ec5cd0e5e18\", \"name\": \"Pointwise Conciseness\", \"description\": \"Assess the model's response is a short and concise answer to user prompt.\", \"version\": \"1.0.0\", \"spec\": {\"promptType\": \"structured\", \"configuration\": {\"modelConfiguration\": {\"name\": \"gpt-4.1\", \"version\": \"2025-04-14\", \"parameters\": [{\"key\": \"temperature\", \"value\": \"0\"}]}, \"promptConfiguration\": {\"evaluationTask\": \"You are an expert evaluator. 
Your task is to evaluate the conciseness of responses generated by AI models.\\nWe will provide you with the user input and an AI-generated response.\\nYou should first read the user input carefully to understand the context and intention, and then evaluate the conciseness of the response based on the criteria provided in the Evaluation section below.\\nYou will assign the response a rating following the Rating Rubric and Evaluation Steps.\\nGive step-by-step explanations for your rating, and only choose ratings from the Rating Rubric.\", \"definition\": \"You will be assessing conciseness, which measures the ability to convey the necessary information in a clear and succinct manner.\", \"criteria\": \"Conciseness: Does the response deliver the essential information without unnecessary words or redundancy?\", \"ratingRubric\": [{\"rating\": 1, \"rule\": \"(Not concise). The response is not concise and is filled with unnecessary or redundant content that obscures the main points.\"}, {\"rating\": 2, \"rule\": \"(Slightly concise). The response is slightly concise and contains a significant amount of unnecessary or redundant information.\"}, {\"rating\": 3, \"rule\": \"(Somewhat concise). The response is somewhat concise but may include some unnecessary words or slightly redundant information.\"}, {\"rating\": 4, \"rule\": \"(Mostly concise). The response is mostly concise and generally avoids unnecessary words while covering the essential information.\"}, {\"rating\": 5, \"rule\": \"(Highly concise). The response is very concise, delivering all necessary information in a succinct manner without any superfluous content.\"}], \"evaluationSteps\": [\"Assess the response in terms of Conciseness. Identify how effectively the response communicates essential information without unnecessary words according to the Criteria.\", \"Score based on the rating rubric. 
Give a brief rationale to explain your evaluation considering Conciseness.\"]}}}, \"additionalProperties\": {\"variables\": [], \"output_type\": \"numerical\", \"supported_values\": [1, 5], \"experimental\": true}}, {\"id\": \"2b3cc135-a031-4d93-8641-1f3833797034\", \"createdAt\": \"2025-11-14 05:10:58.078072\", \"name\": \"groundedness\", \"scenario\": \"genai-evaluations\", \"version\": \"0.0.1\", \"evaluationMethod\": \"llm-as-a-judge\", \"managedBy\": \"imperative\", \"systemPredefined\": false, \"metricType\": \"evaluation\", \"spec\": {\"promptType\": \"free-form\", \"configuration\": {\"modelConfiguration\": {\"name\": \"gpt-4o\", \"version\": \"2024-08-06\", \"parameters\": [{\"key\": \"temperature\", \"value\": \"0.1\"}, {\"key\": \"max_tokens\", \"value\": \"110\"}]}, \"promptConfiguration\": {\"systemPrompt\": \"You should strictly follow the instruction given to you. Please act as an impartial judge and evaluate the quality of the responses based on the prompt and following criteria:\", \"userPrompt\": \"You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models. We will provide you with a reference and an AI-generated response. You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the criteria provided in the Evaluation section below. You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step-by-step explanations for your rating, and only choose ratings from the Rating Rubric.\\n\\n## Metric Definition\\nYou are an INFORMATION OVERLAP classifier providing the overlap of information between a response and reference.\\n\\n## Criteria\\nGroundedness: The of information between a response generated by AI models and provided reference.\\n\\n## Rating Rubric\\n5: (Fully grounded). The response and the reference are fully overlapped.\\n4: (Mostly grounded). 
The response and the reference are mostly overlapped.\\n3: (Somewhat grounded). The response and the reference are somewhat overlapped.\\n2: (Poorly grounded). The response and the reference are slightly overlapped.\\n1: (Not grounded). There is no overlap between the response and the reference.\\n\\n## Evaluation Steps\\nSTEP 1: Assess the response in aspects of Groundedness. Identify any information in the response and provide assessment according to the Criteria.\\nSTEP 2: Score based on the rating rubric. Give a brief rationale to explain your evaluation considering Groundedness.\\n\\nReference: {{?reference}}\\nResponse: {{?aicore_llm_completion}}\\n\\nBegin your evaluation by providing a short explanation. Be as unbiased as possible. After providing your explanation, please rate the response according to the rubric and outputs STRICTLY following this JSON format:\\n\\n{ \\\"explanation\\\": string, \\\"rating\\\": integer }\\n\\nOutput:\\n\", \"dataType\": \"numeric\"}}}}, {\"id\": \"9b349f8e-cd39-486d-809c-3dcfa3b15ac7\", \"createdAt\": \"2025-11-14 05:11:00.137979\", \"name\": \"groundedness\", \"scenario\": \"genai-evaluations\", \"version\": \"0.1.6\", \"evaluationMethod\": \"llm-as-a-judge\", \"managedBy\": \"imperative\", \"systemPredefined\": false, \"metricType\": \"evaluation\", \"spec\": {\"promptType\": \"free-form\", \"configuration\": {\"modelConfiguration\": {\"name\": \"gpt-4o\", \"version\": \"2024-08-06\", \"parameters\": [{\"key\": \"temperature\", \"value\": \"0.1\"}, {\"key\": \"max_tokens\", \"value\": \"110\"}]}, \"promptConfiguration\": {\"systemPrompt\": \"You should strictly follow the instruction given to you. Please act as an impartial judge and evaluate the quality of the responses based on the prompt and following criteria:\", \"userPrompt\": \"You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models. We will provide you with a reference and an AI-generated response. 
You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the criteria provided in the Evaluation section below. You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step-by-step explanations for your rating, and only choose ratings from the Rating Rubric.\\n\\n## Metric Definition\\nYou are an INFORMATION OVERLAP classifier providing the overlap of information between a response and reference.\\n\\n## Criteria\\nGroundedness: The of information between a response generated by AI models and provided reference.\\n\\n## Rating Rubric\\n5: (Fully grounded). The response and the reference are fully overlapped.\\n4: (Mostly grounded). The response and the reference are mostly overlapped.\\n3: (Somewhat grounded). The response and the reference are somewhat overlapped.\\n2: (Poorly grounded). The response and the reference are slightly overlapped.\\n1: (Not grounded). There is no overlap between the response and the reference.\\n\\n## Evaluation Steps\\nSTEP 1: Assess the response in aspects of Groundedness. Identify any information in the response and provide assessment according to the Criteria.\\nSTEP 2: Score based on the rating rubric. Give a brief rationale to explain your evaluation considering Groundedness.\\n\\nReference: {{?reference}}\\nResponse: {{?aicore_llm_completion}}\\n\\nBegin your evaluation by providing a short explanation. Be as unbiased as possible. After providing your explanation, please rate the response according to the rubric and outputs STRICTLY following this JSON format:\\n\\n{ \\\"explanation\\\": string, \\\"rating\\\": integer }\\n\\nOutput:\\n\", \"dataType\": \"numeric\"}}}}]2025-11-14 05:38:21.9548642025-11-14 05:38:21.954869
\n", + "
\n", + " \n", + "
\n", + "

Table: submission

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idrun_idorchestration_configurationtemplate_variablescreated_atupdated_at
f1132352cb524392b733ca0773c6bef0960997700d79409680974700c0d67825{\"modules\": {\"prompt_templating\": {\"prompt\": {\"template\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: {{?question}}.\", \"role\": \"user\"}]}, \"model\": {\"name\": \"gpt-4o\", \"version\": \"2024-08-06\", \"timeout\": 600, \"max_retries\": 2}}}}{\"question\": \"how does rivatigmine and otc sleep medicine interact\", \"sentiment\": \"Interaction\", \"reference\": \"tell your doctor and pharmacist what prescription and nonprescription medications, vitamins, nutritional supplements, and herbal products you are taking or plan to take. Be sure to mention any of the following: antihistamines; aspirin and other nonsteroidal anti-inflammatory medications (NSAIDs) such as ibuprofen (Advil, Motrin) and naproxen (Aleve, Naprosyn); bethanechol (Duvoid, Urecholine); ipratropium (Atrovent, in Combivent, DuoNeb); and medications for Alzheimer's disease, glaucoma, irritable bowel disease, motion sickness, ulcers, or urinary problems. Your doctor may need to change the doses of your medications or monitor you carefully for side effects.\"}2025-11-14 05:38:21.9750022025-11-14 05:38:21.975004
ce3fd3f1f8f04499847153fb1d3a2b0f960997700d79409680974700c0d67825{\"modules\": {\"prompt_templating\": {\"prompt\": {\"template\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: {{?question}}.\", \"role\": \"user\"}]}, \"model\": {\"name\": \"gpt-4o\", \"version\": \"2024-08-06\", \"timeout\": 600, \"max_retries\": 2}}}}{\"question\": \"how does valium affect the brain\", \"sentiment\": \"Action\", \"reference\": \"Diazepam is a benzodiazepine that exerts anxiolytic, sedative, muscle-relaxant, anticonvulsant and amnestic effects. Most of these effects are thought to result from a facilitation of the action of gamma aminobutyric acid (GABA), an inhibitory neurotransmitter in the central nervous system.\"}2025-11-14 05:38:21.9750092025-11-14 05:38:21.975010
e0a2c70183dd4ce289772b5ae74957b6960997700d79409680974700c0d67825{\"modules\": {\"prompt_templating\": {\"prompt\": {\"template\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: {{?question}}.\", \"role\": \"user\"}]}, \"model\": {\"name\": \"gpt-4o\", \"version\": \"2024-08-06\", \"timeout\": 600, \"max_retries\": 2}}}}{\"question\": \"what is morphine\", \"sentiment\": \"Information\", \"reference\": \"Morphine is a pain medication of the opiate family which is found naturally in a number of plants and animals.[5][7] It acts directly on the central nervous system (CNS) to decrease the feeling of pain.\"}2025-11-14 05:38:21.9750142025-11-14 05:38:21.975015
61ca98967e5b45f4865db10d81de298b960997700d79409680974700c0d67825{\"modules\": {\"prompt_templating\": {\"prompt\": {\"template\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: {{?question}}.\", \"role\": \"user\"}]}, \"model\": {\"name\": \"gpt-4o\", \"version\": \"2024-08-06\", \"timeout\": 600, \"max_retries\": 2}}}}{\"question\": \"what are the milligrams for oxycodone e\", \"sentiment\": \"Dose\", \"reference\": \"\\ufffd 10 mg \\ufffd 20 mg \\ufffd 40 mg \\ufffd 80 mg ...\"}2025-11-14 05:38:21.9750182025-11-14 05:38:21.975019
9f84346869014412a1fd55ad52bd23b1960997700d79409680974700c0d67825{\"modules\": {\"prompt_templating\": {\"prompt\": {\"template\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: {{?question}}.\", \"role\": \"user\"}]}, \"model\": {\"name\": \"gpt-4o\", \"version\": \"2024-08-06\", \"timeout\": 600, \"max_retries\": 2}}}}{\"question\": \"81% aspirin contain resin and shellac in it. ?\", \"sentiment\": \"Ingredient\", \"reference\": \"Inactive Ingredients Ingredient Name\"}2025-11-14 05:38:21.9750222025-11-14 05:38:21.975023
98a14a854ba94f09b7a7b48c30878f3b960997700d79409680974700c0d67825{\"modules\": {\"prompt_templating\": {\"prompt\": {\"template\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: {{?question}}.\", \"role\": \"user\"}]}, \"model\": {\"name\": \"gpt-4o\", \"version\": \"2024-08-06\", \"timeout\": 600, \"max_retries\": 2}}}}{\"question\": \"what is desonide ointment used for\", \"sentiment\": \"Indication\", \"reference\": \"Desonide is used to treat the redness, swelling, itching, and discomfort of various skin conditions, including psoriasis (a skin disease in which red, scaly patches form on some areas of the body and eczema (a skin disease that causes the skin to be dry and itchy and to sometimes develop red, scaly rashes).\"}2025-11-14 05:38:21.9750272025-11-14 05:38:21.975027
6c34dd7a4bba4f4e8bd9bff5ab9afa1f960997700d79409680974700c0d67825{\"modules\": {\"prompt_templating\": {\"prompt\": {\"template\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: {{?question}}.\", \"role\": \"user\"}]}, \"model\": {\"name\": \"gpt-4o\", \"version\": \"2024-08-06\", \"timeout\": 600, \"max_retries\": 2}}}}{\"question\": \"how soon can tylenol be taken after a cocktail?\", \"sentiment\": \"Interaction\", \"reference\": \"According to the National Health Service (NHS) in the UK, it is usually safe to drink a small amount of alcohol while taking this pain reliever. ... However, when people take acetaminophen at high doses or together with alcohol, it can cause side effects ranging from minor to severe, with the possibility of fatal liver damage. This risk may be higher for people with alcohol use disorder (AUD), which was previously known as alcoholism.... According to the U.S. National Library of Medicine, taking acetaminophen can be dangerous for people who regularly drink alcohol. Manufacturers currently recommend that people who have more than 3 alcoholic drinks per day should ask their doctor before taking acetaminophen.\"}2025-11-14 05:38:21.9750322025-11-14 05:38:21.975032
b8c446bc2c2c423e9d60b1e7511f5c36960997700d79409680974700c0d67825{\"modules\": {\"prompt_templating\": {\"prompt\": {\"template\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: {{?question}}.\", \"role\": \"user\"}]}, \"model\": {\"name\": \"gpt-4o\", \"version\": \"2024-08-06\", \"timeout\": 600, \"max_retries\": 2}}}}{\"question\": \"breo inhaler how it works\", \"sentiment\": \"Action\", \"reference\": \"The combination of fluticasone and vilanterol is used to control wheezing, shortness of breath, coughing, and chest tightness caused by asthma and chronic obstructive pulmonary (COPD; a group of diseases that affect the lungs and airways, that includes chronic bronchitis and emphysema). Fluticasone is in a class of medications called steroids. It works by reducing swelling in the airways. Vilanterol is in a class of medications called long-acting beta-agonists (LABAs). It works by relaxing and opening air passages in the lungs, making it easier to breathe.\"}2025-11-14 05:38:21.9750362025-11-14 05:38:21.975036
4b6e2f1ddefa4f1585977e1cbe2fa900960997700d79409680974700c0d67825{\"modules\": {\"prompt_templating\": {\"prompt\": {\"template\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: {{?question}}.\", \"role\": \"user\"}]}, \"model\": {\"name\": \"gpt-4o\", \"version\": \"2024-08-06\", \"timeout\": 600, \"max_retries\": 2}}}}{\"question\": \"breo inhaler how it works\", \"sentiment\": \"Usage\", \"reference\": \"To use the inhaler, follow these steps:\\n\\t1\\tIf you will be using a new inhaler for the first time, remove it from the box and the foil wrapper. Fill in the \\\"Tray opened\\\" and \\\"Discard\\\" blanks on the inhaler label with the date that you opened the pouch and the date 6 weeks later when you must replace the inhaler.\\n\\t2\\tWhen you are ready to inhale your dose, slide the cover down to expose the mouthpiece until it clicks. If you open and close the inhaler without using your dose, you will waste the medication.\\n\\t3\\tThe counter will count down by 1 each time you open the cover. If the counter does not count down, your inhaler will not provide the medicine. If your inhaler does not count down, call your pharmacist or doctor.\\n\\t4\\tHold the inhaler away from your mouth and breathe out as far as you comfortably can. Do not breathe out into the mouthpiece.\\n\\t5\\tPut the mouthpiece between your lips, and close your lips firmly around it. Take a long, steady, deep breath in through your mouth. Do not breathe in through your nose. Be careful not block the air vent with your fingers.\\n\\t6\\tRemove the inhaler from your mouth, and hold your breath for about 3 to 4 seconds or as long as you comfortably can. Breathe out slowly.\\n\\t7\\tYou may or may not taste or feel the medicine released by the inhaler. Even if you do not, do not inhale another dose. 
If you are not sure you are getting your dose of fluticasone and vilanterol, call your doctor or pharmacist.\\n\\t8\\tYou may clean the mouthpiece with a dry tissue, if needed. Slide the cover up over the mouthpiece as far as it will go to close the inhaler.\\n\\t9\\tRinse your mouth with water, but do not swallow.\\nAsk your pharmacist or doctor for a copy of the manufacturer's information for the patient.\"}2025-11-14 05:38:21.9750402025-11-14 05:38:21.975040
ac009b3ba51c470ea3d672e1dc0fc290960997700d79409680974700c0d67825{\"modules\": {\"prompt_templating\": {\"prompt\": {\"template\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: {{?question}}.\", \"role\": \"user\"}]}, \"model\": {\"name\": \"gpt-4o\", \"version\": \"2024-08-06\", \"timeout\": 600, \"max_retries\": 2}}}}{\"question\": \"qvar 40mg what is it for\", \"sentiment\": \"Indication\", \"reference\": \"QVAR is indicated in the maintenance treatment of asthma as prophylactic therapy in patients 5 years of age and older. QVAR is also indicated for asthma patients who require systemic corticosteroid administration, where adding QVAR may reduce or eliminate the need for the systemic corticosteroids.\"}2025-11-14 05:38:21.9750442025-11-14 05:38:21.975045
\n", + "
\n", + " \n", + "
\n", + "

Table: submission_result

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
submission_idrun_idrepetition_countcompletion_resultlatencycreated_atupdated_at
f1132352cb524392b733ca0773c6bef0960997700d79409680974700c0d678251{\"request_id\": \"a84d63c1-35ad-9e6e-aa8d-bc27f6c22e5e\", \"intermediate_results\": {\"templating\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: how does rivatigmine and otc sleep medicine interact.\", \"role\": \"user\"}], \"llm\": {\"id\": \"chatcmpl-CbgpKlbqIxb3V5NHK52VUybEb7B5T\", \"object\": \"chat.completion\", \"created\": 1763098798, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Rivastigmine is a pharmaceutical drug primarily used for the treatment of mild to moderate dementia associated with Alzheimer's and Parkinson's disease. It belongs to a class of drugs known as cholinesterase inhibitors. Over-the-counter (OTC) sleep medicines often contain antihistamines, such as diphenhydramine or doxylamine, which can cause sedation.\\n\\n**Benefits of Rivastigmine:**\\n- Improvement in cognitive function in patients with Alzheimer's and Parkinson's disease-related dementia.\\n- May help with daily living activities by slowing the progression of symptoms.\\n- Possible benefits in improving attention and memory.\\n\\n**Side Effects of Rivastigmine:**\\n- Common side effects include nausea, vomiting, and diarrhea.\\n- Loss of appetite and weight loss.\\n- Dizziness or headache.\\n- Excessive sweating.\\n- More severe side effects can include heart problems such as bradycardia, peptic ulcers, or exacerbation of respiratory conditions.\\n\\n**Interactions between Rivastigmine and OTC Sleep Medicine:**\\n- **Enhanced Sedation:** Rivastigmine can enhance the sedative effects of antihistamines found in OTC sleep aids, leading to increased drowsiness or dizziness.\\n- **Cognitive Effects:** The combination could lead to worsened cognitive impairment in patients already dealing with cognitive decline, due to the sedative effects of 
sleep medicines.\\n- **Anticholinergic Effects:** Many OTC sleep aids have anticholinergic properties, which can counteract the effects of cholinesterase inhibitors like rivastigmine, potentially reducing its efficacy.\\n- **Risk of Falls:** Increased sedation and dizziness can raise the risk of falls, especially in elderly patients.\\n\\nIt is important for individuals considering these medications to consult a healthcare provider to carefully evaluate the risks, benefits, and potential interactions specific to their health needs.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 367, \"prompt_tokens\": 34, \"total_tokens\": 401}}}, \"final_result\": {\"id\": \"chatcmpl-CbgpKlbqIxb3V5NHK52VUybEb7B5T\", \"object\": \"chat.completion\", \"created\": 1763098798, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Rivastigmine is a pharmaceutical drug primarily used for the treatment of mild to moderate dementia associated with Alzheimer's and Parkinson's disease. It belongs to a class of drugs known as cholinesterase inhibitors. 
Over-the-counter (OTC) sleep medicines often contain antihistamines, such as diphenhydramine or doxylamine, which can cause sedation.\\n\\n**Benefits of Rivastigmine:**\\n- Improvement in cognitive function in patients with Alzheimer's and Parkinson's disease-related dementia.\\n- May help with daily living activities by slowing the progression of symptoms.\\n- Possible benefits in improving attention and memory.\\n\\n**Side Effects of Rivastigmine:**\\n- Common side effects include nausea, vomiting, and diarrhea.\\n- Loss of appetite and weight loss.\\n- Dizziness or headache.\\n- Excessive sweating.\\n- More severe side effects can include heart problems such as bradycardia, peptic ulcers, or exacerbation of respiratory conditions.\\n\\n**Interactions between Rivastigmine and OTC Sleep Medicine:**\\n- **Enhanced Sedation:** Rivastigmine can enhance the sedative effects of antihistamines found in OTC sleep aids, leading to increased drowsiness or dizziness.\\n- **Cognitive Effects:** The combination could lead to worsened cognitive impairment in patients already dealing with cognitive decline, due to the sedative effects of sleep medicines.\\n- **Anticholinergic Effects:** Many OTC sleep aids have anticholinergic properties, which can counteract the effects of cholinesterase inhibitors like rivastigmine, potentially reducing its efficacy.\\n- **Risk of Falls:** Increased sedation and dizziness can raise the risk of falls, especially in elderly patients.\\n\\nIt is important for individuals considering these medications to consult a healthcare provider to carefully evaluate the risks, benefits, and potential interactions specific to their health needs.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 367, \"prompt_tokens\": 34, \"total_tokens\": 401}}}9.1126042025-11-14 05:43:14.0161692025-11-14 05:43:14.016173
ce3fd3f1f8f04499847153fb1d3a2b0f960997700d79409680974700c0d678251{\"request_id\": \"0e717c24-58c4-9c75-8b79-487f11da96c3\", \"intermediate_results\": {\"templating\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: how does valium affect the brain.\", \"role\": \"user\"}], \"llm\": {\"id\": \"chatcmpl-CbgpOcNTGm3nqeh2uGJBEijIUDYbC\", \"object\": \"chat.completion\", \"created\": 1763098802, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Valium, also known by its generic name diazepam, is a medication belonging to the benzodiazepine class. It is commonly used to treat anxiety, muscle spasms, and seizures, and it can also be used in certain situations to provide sedation before medical procedures. Here's how Valium affects the brain, including its benefits and potential side effects:\\n\\n### Benefits:\\n1. **Anxiety Relief**: Valium acts on the central nervous system to produce calming effects, thus reducing symptoms of anxiety.\\n2. **Muscle Relaxation**: It can help relieve muscle spasms by promoting relaxation.\\n3. **Seizure Control**: Valium can be used to manage certain types of seizures, especially when used in conjunction with other medications.\\n4. **Sedation**: It may be used to sedate patients before surgery or medical procedures, helping them relax.\\n5. **Alcohol Withdrawal**: Valium is sometimes used to manage symptoms associated with acute alcohol withdrawal.\\n\\n### Side Effects:\\n1. **Drowsiness and Fatigue**: One of the most common side effects, which can affect concentration and coordination.\\n2. **Dizziness**: Users might experience light-headedness or a sensation of spinning.\\n3. **Cognitive Effects**: Can include confusion or forgetfulness.\\n4. 
**Dependence and Withdrawal**: Long-term use can lead to dependence, with withdrawal symptoms occurring if the medication is abruptly stopped.\\n5. **Coordination Problems**: There may be issues with balance and physical coordination.\\n6. **Blurred Vision**: Some users report changes in their vision.\\n7. **Mood Changes**: Can include feelings of depression or euphoria.\\n8. **Gastrointestinal Issues**: Such as nausea, constipation, or dry mouth.\\n9. **Respiratory Depression**: High doses can depress breathing, especially when combined with other CNS depressants, such as alcohol.\\n\\n### Important Considerations:\\n- **Addiction Risk**: Because it has the potential for abuse and addiction, Valium should be used only as prescribed by a healthcare professional.\\n- **Interactions**: Valium can interact with other medications, including alcohol, further increasing sedative effects.\\n- **Not Suitable for Everyone**: It might not be appropriate for people with certain conditions, such as severe lung conditions, sleep apnea, or a history of substance abuse.\\n\\nAlways follow the advice of a healthcare provider when using Valium, and report any side effects or concerns you have while taking the medication.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 507, \"prompt_tokens\": 31, \"total_tokens\": 538}}}, \"final_result\": {\"id\": \"chatcmpl-CbgpOcNTGm3nqeh2uGJBEijIUDYbC\", \"object\": \"chat.completion\", \"created\": 1763098802, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Valium, also known by its generic name diazepam, is a medication belonging to the benzodiazepine class. It is commonly used to treat anxiety, muscle spasms, and seizures, and it can also be used in certain situations to provide sedation before medical procedures. 
Here's how Valium affects the brain, including its benefits and potential side effects:\\n\\n### Benefits:\\n1. **Anxiety Relief**: Valium acts on the central nervous system to produce calming effects, thus reducing symptoms of anxiety.\\n2. **Muscle Relaxation**: It can help relieve muscle spasms by promoting relaxation.\\n3. **Seizure Control**: Valium can be used to manage certain types of seizures, especially when used in conjunction with other medications.\\n4. **Sedation**: It may be used to sedate patients before surgery or medical procedures, helping them relax.\\n5. **Alcohol Withdrawal**: Valium is sometimes used to manage symptoms associated with acute alcohol withdrawal.\\n\\n### Side Effects:\\n1. **Drowsiness and Fatigue**: One of the most common side effects, which can affect concentration and coordination.\\n2. **Dizziness**: Users might experience light-headedness or a sensation of spinning.\\n3. **Cognitive Effects**: Can include confusion or forgetfulness.\\n4. **Dependence and Withdrawal**: Long-term use can lead to dependence, with withdrawal symptoms occurring if the medication is abruptly stopped.\\n5. **Coordination Problems**: There may be issues with balance and physical coordination.\\n6. **Blurred Vision**: Some users report changes in their vision.\\n7. **Mood Changes**: Can include feelings of depression or euphoria.\\n8. **Gastrointestinal Issues**: Such as nausea, constipation, or dry mouth.\\n9. 
**Respiratory Depression**: High doses can depress breathing, especially when combined with other CNS depressants, such as alcohol.\\n\\n### Important Considerations:\\n- **Addiction Risk**: Because it has the potential for abuse and addiction, Valium should be used only as prescribed by a healthcare professional.\\n- **Interactions**: Valium can interact with other medications, including alcohol, further increasing sedative effects.\\n- **Not Suitable for Everyone**: It might not be appropriate for people with certain conditions, such as severe lung conditions, sleep apnea, or a history of substance abuse.\\n\\nAlways follow the advice of a healthcare provider when using Valium, and report any side effects or concerns you have while taking the medication.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 507, \"prompt_tokens\": 31, \"total_tokens\": 538}}}16.7714402025-11-14 05:43:14.0161742025-11-14 05:43:14.016174
e0a2c70183dd4ce289772b5ae74957b6960997700d79409680974700c0d678251{\"request_id\": \"7210615f-2b11-9523-a798-ed1af505e05f\", \"intermediate_results\": {\"templating\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: what is morphine.\", \"role\": \"user\"}], \"llm\": {\"id\": \"chatcmpl-CbgpSRd0RQJNQTjNWQs53chmAduGo\", \"object\": \"chat.completion\", \"created\": 1763098806, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Morphine is a powerful opioid pain medication used to treat moderate to severe pain. It's often used in a hospital setting for pain relief after surgery or for chronic pain management. Here are the benefits and side effects of morphine:\\n\\n### Benefits:\\n1. **Pain Relief**: Morphine is highly effective in reducing severe pain, providing relief for patients who are suffering from acute or chronic pain conditions.\\n2. **Reduced Anxiety and Stress**: By alleviating pain, morphine can also help reduce anxiety and stress associated with severe pain.\\n3. **Sedation**: It may provide a calming effect which can be beneficial for patients in extreme discomfort.\\n4. **Improves Quality of Life**: For those with chronic pain, it can improve daily functioning and overall quality of life.\\n5. **Flexible Administration**: Morphine can be administered in various forms, including orally, intravenously, and via injection, providing flexibility in its use.\\n\\n### Side Effects:\\n1. **Respiratory Depression**: One of the most serious side effects, morphine can slow down breathing, which can be dangerous, requiring careful monitoring especially when first starting the medication or adjusting doses.\\n2. **Nausea and Vomiting**: Common side effects, especially when first starting treatment.\\n3. 
**Drowsiness and Sedation**: Can limit activities such as driving or operating heavy machinery.\\n4. **Constipation**: A very common issue with opioid use; patients may need to use laxatives or other remedies to counteract this effect.\\n5. **Dizziness**: This may occur, leading to risks of falls, especially in older adults.\\n6. **Tolerance and Dependence**: Long-term use can lead to tolerance (requiring higher doses to achieve the same effect) and physical dependence, meaning that withdrawal symptoms may occur if the medication is abruptly stopped.\\n7. **Potential for Abuse**: As an opioid, morphine has a potential for addiction and abuse, which necessitates careful monitoring and regulation of its use.\\n8. **Allergic Reactions**: Although rare, some individuals may experience allergic reactions such as skin rashes, itching, or more severe reactions.\\n\\nGiven its potent effects and risks, morphine should be used under strict medical supervision to manage its efficacy and safety.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 463, \"prompt_tokens\": 28, \"total_tokens\": 491}}}, \"final_result\": {\"id\": \"chatcmpl-CbgpSRd0RQJNQTjNWQs53chmAduGo\", \"object\": \"chat.completion\", \"created\": 1763098806, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Morphine is a powerful opioid pain medication used to treat moderate to severe pain. It's often used in a hospital setting for pain relief after surgery or for chronic pain management. Here are the benefits and side effects of morphine:\\n\\n### Benefits:\\n1. **Pain Relief**: Morphine is highly effective in reducing severe pain, providing relief for patients who are suffering from acute or chronic pain conditions.\\n2. **Reduced Anxiety and Stress**: By alleviating pain, morphine can also help reduce anxiety and stress associated with severe pain.\\n3. 
**Sedation**: It may provide a calming effect which can be beneficial for patients in extreme discomfort.\\n4. **Improves Quality of Life**: For those with chronic pain, it can improve daily functioning and overall quality of life.\\n5. **Flexible Administration**: Morphine can be administered in various forms, including orally, intravenously, and via injection, providing flexibility in its use.\\n\\n### Side Effects:\\n1. **Respiratory Depression**: One of the most serious side effects, morphine can slow down breathing, which can be dangerous, requiring careful monitoring especially when first starting the medication or adjusting doses.\\n2. **Nausea and Vomiting**: Common side effects, especially when first starting treatment.\\n3. **Drowsiness and Sedation**: Can limit activities such as driving or operating heavy machinery.\\n4. **Constipation**: A very common issue with opioid use; patients may need to use laxatives or other remedies to counteract this effect.\\n5. **Dizziness**: This may occur, leading to risks of falls, especially in older adults.\\n6. **Tolerance and Dependence**: Long-term use can lead to tolerance (requiring higher doses to achieve the same effect) and physical dependence, meaning that withdrawal symptoms may occur if the medication is abruptly stopped.\\n7. **Potential for Abuse**: As an opioid, morphine has a potential for addiction and abuse, which necessitates careful monitoring and regulation of its use.\\n8. **Allergic Reactions**: Although rare, some individuals may experience allergic reactions such as skin rashes, itching, or more severe reactions.\\n\\nGiven its potent effects and risks, morphine should be used under strict medical supervision to manage its efficacy and safety.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 463, \"prompt_tokens\": 28, \"total_tokens\": 491}}}17.6568122025-11-14 05:43:14.0161752025-11-14 05:43:14.016177
61ca98967e5b45f4865db10d81de298b960997700d79409680974700c0d678251{\"request_id\": \"34e8b9ee-890d-957d-89f4-53dfcad18a20\", \"intermediate_results\": {\"templating\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: what are the milligrams for oxycodone e.\", \"role\": \"user\"}], \"llm\": {\"id\": \"chatcmpl-CbgpWCiud9mjIG1PkXPDcHc9HFIrQ\", \"object\": \"chat.completion\", \"created\": 1763098810, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Oxycodone is a prescription opioid medication used to treat moderate to severe pain. It is important to use this medication exactly as prescribed by a healthcare provider. Here are some of the benefits and side effects of oxycodone:\\n\\n### Benefits\\n1. **Pain Relief**: Oxycodone is effective in reducing or eliminating acute and chronic pain.\\n2. **Improved Quality of Life**: For individuals with chronic pain, oxycodone can improve the ability to perform daily activities and overall quality of life.\\n3. **Various Formulations**: Available in immediate-release and extended-release formulations for flexible pain management.\\n\\n### Side Effects\\nCommon side effects include:\\n1. **Nausea and Vomiting**: These can occur when starting the medication or adjusting the dose.\\n2. **Constipation**: A very common side effect that often requires dietary changes or laxatives to manage.\\n3. **Drowsiness and Dizziness**: May impair the ability to drive or operate machinery.\\n4. **Sweating**: Increased sweating is a possible side effect.\\n5. **Dry Mouth**: Can occur with opioid use.\\n\\nSerious side effects may include:\\n1. **Respiratory Depression**: Slowed breathing can be life-threatening and requires immediate medical attention.\\n2. **Addiction, Abuse, and Misuse**: Oxycodone has a high potential for addiction.\\n3. 
**Withdrawal Symptoms**: These can occur if the medication is stopped abruptly after prolonged use.\\n4. **Interactions with Other Medications**: Can have harmful interactions with other drugs, especially other CNS depressants.\\n5. **Severe Allergic Reactions**: Symptoms such as rash, itching, swelling, severe dizziness, or trouble breathing require immediate medical attention.\\n\\nAlways consult a healthcare provider for personalized advice and to discuss the risks and benefits before starting any new medication.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 379, \"prompt_tokens\": 34, \"total_tokens\": 413}}}, \"final_result\": {\"id\": \"chatcmpl-CbgpWCiud9mjIG1PkXPDcHc9HFIrQ\", \"object\": \"chat.completion\", \"created\": 1763098810, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Oxycodone is a prescription opioid medication used to treat moderate to severe pain. It is important to use this medication exactly as prescribed by a healthcare provider. Here are some of the benefits and side effects of oxycodone:\\n\\n### Benefits\\n1. **Pain Relief**: Oxycodone is effective in reducing or eliminating acute and chronic pain.\\n2. **Improved Quality of Life**: For individuals with chronic pain, oxycodone can improve the ability to perform daily activities and overall quality of life.\\n3. **Various Formulations**: Available in immediate-release and extended-release formulations for flexible pain management.\\n\\n### Side Effects\\nCommon side effects include:\\n1. **Nausea and Vomiting**: These can occur when starting the medication or adjusting the dose.\\n2. **Constipation**: A very common side effect that often requires dietary changes or laxatives to manage.\\n3. **Drowsiness and Dizziness**: May impair the ability to drive or operate machinery.\\n4. **Sweating**: Increased sweating is a possible side effect.\\n5. 
**Dry Mouth**: Can occur with opioid use.\\n\\nSerious side effects may include:\\n1. **Respiratory Depression**: Slowed breathing can be life-threatening and requires immediate medical attention.\\n2. **Addiction, Abuse, and Misuse**: Oxycodone has a high potential for addiction.\\n3. **Withdrawal Symptoms**: These can occur if the medication is stopped abruptly after prolonged use.\\n4. **Interactions with Other Medications**: Can have harmful interactions with other drugs, especially other CNS depressants.\\n5. **Severe Allergic Reactions**: Symptoms such as rash, itching, swelling, severe dizziness, or trouble breathing require immediate medical attention.\\n\\nAlways consult a healthcare provider for personalized advice and to discuss the risks and benefits before starting any new medication.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 379, \"prompt_tokens\": 34, \"total_tokens\": 413}}}20.1365062025-11-14 05:43:14.0161782025-11-14 05:43:14.016179
9f84346869014412a1fd55ad52bd23b1960997700d79409680974700c0d678251{\"request_id\": \"38ddc04e-b9df-92d8-936a-fbd4b4b5f407\", \"intermediate_results\": {\"templating\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: 81% aspirin contain resin and shellac in it. ?.\", \"role\": \"user\"}], \"llm\": {\"id\": \"chatcmpl-Cbgpaso97P0FcGhMa2DzrKllg1yEq\", \"object\": \"chat.completion\", \"created\": 1763098814, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Aspirin, including low-dose formulations like 81 mg aspirin, is widely used for its anti-inflammatory, analgesic, and antipyretic properties and its potential protective effects against heart attack and stroke. Here's a detailed list of benefits and side effects associated with aspirin:\\n\\n### Benefits:\\n1. **Cardiovascular Protection:** Low-dose aspirin is commonly prescribed to reduce the risk of heart attack and stroke, particularly in individuals with cardiovascular disease or risk factors.\\n2. **Pain Relief:** Aspirin provides relief from mild to moderate pain, such as headaches, muscle aches, toothaches, and menstrual cramps.\\n3. **Anti-Inflammatory Effects:** It helps reduce inflammation from conditions such as arthritis.\\n4. **Antipyretic Effects:** Aspirin can help reduce fever.\\n\\n### Side Effects:\\n1. **Gastrointestinal Issues:** These can include stomach pain, heartburn, nausea, and more serious effects like gastrointestinal bleeding or ulcers.\\n2. **Increased Bleeding Risk:** Aspirin affects blood clotting, which can lead to an increased risk of bleeding, especially if used long term.\\n3. **Allergic Reactions:** Some individuals may experience allergic reactions to aspirin, including hives, facial swelling, asthma, or anaphylaxis.\\n4. 
**Kidney and Liver Effects:** Prolonged use of aspirin can potentially impact kidney and liver function, though this is more common with high doses or long-term use.\\n5. **Reye's Syndrome:** Aspirin is linked to Reye's syndrome when given to children with viral infections; thus, it is generally avoided in pediatric medicine for viral illnesses.\\n6. **Ringing in the Ears:** Also known as tinnitus, this can occur at higher doses of aspirin.\\n\\nRegarding the mention of resin and shellac, these are commonly used as excipients or coating materials in the manufacturing of pills, including some aspirin products. They generally serve to protect the pill, control its release into the system, or enhance its appearance. These components are usually considered safe, but people with specific allergies or sensitivities might need to be cautious.\\n\\nAlways consult a healthcare provider before starting or changing a medication regimen to ensure it aligns with your health needs and conditions.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 447, \"prompt_tokens\": 36, \"total_tokens\": 483}}}, \"final_result\": {\"id\": \"chatcmpl-Cbgpaso97P0FcGhMa2DzrKllg1yEq\", \"object\": \"chat.completion\", \"created\": 1763098814, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Aspirin, including low-dose formulations like 81 mg aspirin, is widely used for its anti-inflammatory, analgesic, and antipyretic properties and its potential protective effects against heart attack and stroke. Here's a detailed list of benefits and side effects associated with aspirin:\\n\\n### Benefits:\\n1. **Cardiovascular Protection:** Low-dose aspirin is commonly prescribed to reduce the risk of heart attack and stroke, particularly in individuals with cardiovascular disease or risk factors.\\n2. 
**Pain Relief:** Aspirin provides relief from mild to moderate pain, such as headaches, muscle aches, toothaches, and menstrual cramps.\\n3. **Anti-Inflammatory Effects:** It helps reduce inflammation from conditions such as arthritis.\\n4. **Antipyretic Effects:** Aspirin can help reduce fever.\\n\\n### Side Effects:\\n1. **Gastrointestinal Issues:** These can include stomach pain, heartburn, nausea, and more serious effects like gastrointestinal bleeding or ulcers.\\n2. **Increased Bleeding Risk:** Aspirin affects blood clotting, which can lead to an increased risk of bleeding, especially if used long term.\\n3. **Allergic Reactions:** Some individuals may experience allergic reactions to aspirin, including hives, facial swelling, asthma, or anaphylaxis.\\n4. **Kidney and Liver Effects:** Prolonged use of aspirin can potentially impact kidney and liver function, though this is more common with high doses or long-term use.\\n5. **Reye's Syndrome:** Aspirin is linked to Reye's syndrome when given to children with viral infections; thus, it is generally avoided in pediatric medicine for viral illnesses.\\n6. **Ringing in the Ears:** Also known as tinnitus, this can occur at higher doses of aspirin.\\n\\nRegarding the mention of resin and shellac, these are commonly used as excipients or coating materials in the manufacturing of pills, including some aspirin products. They generally serve to protect the pill, control its release into the system, or enhance its appearance. These components are usually considered safe, but people with specific allergies or sensitivities might need to be cautious.\\n\\nAlways consult a healthcare provider before starting or changing a medication regimen to ensure it aligns with your health needs and conditions.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 447, \"prompt_tokens\": 36, \"total_tokens\": 483}}}26.6376532025-11-14 05:43:14.0161802025-11-14 05:43:14.016180
98a14a854ba94f09b7a7b48c30878f3b960997700d79409680974700c0d678251{\"request_id\": \"b3487dc5-1c06-9c24-8493-fcc226b751a3\", \"intermediate_results\": {\"templating\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: what is desonide ointment used for.\", \"role\": \"user\"}], \"llm\": {\"id\": \"chatcmpl-CbgpeWG1wvBScw4hRug9GmhLBfQYr\", \"object\": \"chat.completion\", \"created\": 1763098818, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Desonide ointment is a topical corticosteroid used primarily to treat various skin conditions by reducing inflammation, redness, and itching. Here are the benefits and potential side effects of using desonide ointment:\\n\\n### Benefits\\n1. **Reduces Inflammation:** It helps alleviate inflammation associated with conditions like eczema, dermatitis, and psoriasis.\\n2. **Decreases Itching:** By calming the skin, desonide reduces the urge to scratch, which can improve comfort.\\n3. **Reduces Redness:** The ointment helps in minimizing redness and swelling of the affected areas.\\n4. **Promotes Healing:** By managing symptoms like inflammation and itching, it supports faster healing of the skin.\\n5. **Mild Formulation:** As a low-potency steroid, it is often recommended for sensitive skin areas and for use in children.\\n\\n### Side Effects\\nWhile desonide is considered low-potency and generally well-tolerated, some potential side effects may occur:\\n\\n1. **Skin Irritation:** This can include burning, itching, or dryness at the application site.\\n2. **Contact Dermatitis:** Some individuals may experience an allergic reaction leading to a rash.\\n3. **Hypopigmentation:** Prolonged use might cause lightening of the skin.\\n4. **Thinning of Skin:** Extended use, especially under occlusive dressings, can lead to skin thinning.\\n5. 
**Stretch Marks:** May occur with long-term use in certain areas.\\n6. **Increased Hair Growth:** Some people may notice more hair growth in the treated area.\\n7. **Systemic Absorption:** Though rare, especially with topical use, large amounts or prolonged use can lead to systemic absorption with corticosteroid-related side effects.\\n\\nIt is important to use desonide ointment as directed by a healthcare provider. If you experience severe side effects or signs of an allergic reaction, such as rash, swelling, or difficulty breathing, seek medical attention promptly.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 395, \"prompt_tokens\": 33, \"total_tokens\": 428}}}, \"final_result\": {\"id\": \"chatcmpl-CbgpeWG1wvBScw4hRug9GmhLBfQYr\", \"object\": \"chat.completion\", \"created\": 1763098818, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Desonide ointment is a topical corticosteroid used primarily to treat various skin conditions by reducing inflammation, redness, and itching. Here are the benefits and potential side effects of using desonide ointment:\\n\\n### Benefits\\n1. **Reduces Inflammation:** It helps alleviate inflammation associated with conditions like eczema, dermatitis, and psoriasis.\\n2. **Decreases Itching:** By calming the skin, desonide reduces the urge to scratch, which can improve comfort.\\n3. **Reduces Redness:** The ointment helps in minimizing redness and swelling of the affected areas.\\n4. **Promotes Healing:** By managing symptoms like inflammation and itching, it supports faster healing of the skin.\\n5. **Mild Formulation:** As a low-potency steroid, it is often recommended for sensitive skin areas and for use in children.\\n\\n### Side Effects\\nWhile desonide is considered low-potency and generally well-tolerated, some potential side effects may occur:\\n\\n1. 
**Skin Irritation:** This can include burning, itching, or dryness at the application site.\\n2. **Contact Dermatitis:** Some individuals may experience an allergic reaction leading to a rash.\\n3. **Hypopigmentation:** Prolonged use might cause lightening of the skin.\\n4. **Thinning of Skin:** Extended use, especially under occlusive dressings, can lead to skin thinning.\\n5. **Stretch Marks:** May occur with long-term use in certain areas.\\n6. **Increased Hair Growth:** Some people may notice more hair growth in the treated area.\\n7. **Systemic Absorption:** Though rare, especially with topical use, large amounts or prolonged use can lead to systemic absorption with corticosteroid-related side effects.\\n\\nIt is important to use desonide ointment as directed by a healthcare provider. If you experience severe side effects or signs of an allergic reaction, such as rash, swelling, or difficulty breathing, seek medical attention promptly.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 395, \"prompt_tokens\": 33, \"total_tokens\": 428}}}30.8816902025-11-14 05:43:14.0161812025-11-14 05:43:14.016181
6c34dd7a4bba4f4e8bd9bff5ab9afa1f960997700d79409680974700c0d678251{\"request_id\": \"fd5cbafd-b66b-9b97-93c2-eebcdc27661d\", \"intermediate_results\": {\"templating\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: how soon can tylenol be taken after a cocktail?.\", \"role\": \"user\"}], \"llm\": {\"id\": \"chatcmpl-CbgpiVkYdzlpHQQsRHqvnGJfohWuk\", \"object\": \"chat.completion\", \"created\": 1763098822, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"### Benefits of Tylenol (Acetaminophen)\\n\\n1. **Pain Relief:** \\n - Tylenol is commonly used to relieve mild to moderate pain from headaches, muscle aches, menstrual periods, colds and sore throats, toothaches, and backaches.\\n\\n2. **Fever Reduction:**\\n - It is effective in reducing fever.\\n\\n3. **Generally Safe:**\\n - When taken at recommended doses, Tylenol is generally safe for most people and can be used by those who cannot take NSAIDs (non-steroidal anti-inflammatory drugs) like aspirin, ibuprofen, or naproxen.\\n\\n4. **Well-Tolerated:**\\n - It generally causes fewer gastrointestinal side effects than NSAIDs.\\n\\n### Side Effects of Tylenol (Acetaminophen)\\n\\n1. **Liver Damage:**\\n - High doses or prolonged use can cause severe liver damage. This risk is increased with excessive alcohol consumption.\\n\\n2. **Allergic Reactions:**\\n - Rarely, some people may experience allergic reactions like rash, itching, swelling, severe dizziness, or trouble breathing.\\n\\n3. **Kidney Damage:**\\n - Long-term use can potentially lead to kidney damage, especially at high doses.\\n\\n### Regarding Alcohol Consumption\\n\\n- **Timing:** \\n - It is generally advised to wait at least 3-4 hours after consuming a cocktail before taking Tylenol. This allows your body to process the alcohol and reduces the risk of liver damage. 
However, depending on the amount of alcohol consumed and individual health considerations, it might be better to consult a healthcare professional for personalized advice.\\n\\n- **Risk of Liver Damage:**\\n - Both alcohol and acetaminophen are processed via the liver, and taking them together, especially in large amounts or over a long period, can increase the risk of liver damage.\\n\\nIt is always important to follow dosing instructions on the medication label and to consult with a healthcare professional if you have specific concerns or health conditions that might affect how you should use Tylenol.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 414, \"prompt_tokens\": 35, \"total_tokens\": 449}}}, \"final_result\": {\"id\": \"chatcmpl-CbgpiVkYdzlpHQQsRHqvnGJfohWuk\", \"object\": \"chat.completion\", \"created\": 1763098822, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"### Benefits of Tylenol (Acetaminophen)\\n\\n1. **Pain Relief:** \\n - Tylenol is commonly used to relieve mild to moderate pain from headaches, muscle aches, menstrual periods, colds and sore throats, toothaches, and backaches.\\n\\n2. **Fever Reduction:**\\n - It is effective in reducing fever.\\n\\n3. **Generally Safe:**\\n - When taken at recommended doses, Tylenol is generally safe for most people and can be used by those who cannot take NSAIDs (non-steroidal anti-inflammatory drugs) like aspirin, ibuprofen, or naproxen.\\n\\n4. **Well-Tolerated:**\\n - It generally causes fewer gastrointestinal side effects than NSAIDs.\\n\\n### Side Effects of Tylenol (Acetaminophen)\\n\\n1. **Liver Damage:**\\n - High doses or prolonged use can cause severe liver damage. This risk is increased with excessive alcohol consumption.\\n\\n2. 
**Allergic Reactions:**\\n - Rarely, some people may experience allergic reactions like rash, itching, swelling, severe dizziness, or trouble breathing.\\n\\n3. **Kidney Damage:**\\n - Long-term use can potentially lead to kidney damage, especially at high doses.\\n\\n### Regarding Alcohol Consumption\\n\\n- **Timing:** \\n - It is generally advised to wait at least 3-4 hours after consuming a cocktail before taking Tylenol. This allows your body to process the alcohol and reduces the risk of liver damage. However, depending on the amount of alcohol consumed and individual health considerations, it might be better to consult a healthcare professional for personalized advice.\\n\\n- **Risk of Liver Damage:**\\n - Both alcohol and acetaminophen are processed via the liver, and taking them together, especially in large amounts or over a long period, can increase the risk of liver damage.\\n\\nIt is always important to follow dosing instructions on the medication label and to consult with a healthcare professional if you have specific concerns or health conditions that might affect how you should use Tylenol.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 414, \"prompt_tokens\": 35, \"total_tokens\": 449}}}34.0473912025-11-14 05:43:14.0161822025-11-14 05:43:14.016182
b8c446bc2c2c423e9d60b1e7511f5c36960997700d79409680974700c0d678251{\"request_id\": \"ad1459b8-f922-9fd0-ae8b-beb41d2bfc9a\", \"intermediate_results\": {\"templating\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: breo inhaler how it works.\", \"role\": \"user\"}], \"llm\": {\"id\": \"chatcmpl-Cbgpm1fLXLizRqds95YiTnF1MgSfk\", \"object\": \"chat.completion\", \"created\": 1763098826, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"The Breo inhaler, also known by its generic components fluticasone furoate and vilanterol, is used primarily to treat asthma and chronic obstructive pulmonary disease (COPD). Here's how it works and its potential benefits and side effects:\\n\\n### How It Works:\\n- **Fluticasone Furoate**: This is a corticosteroid that reduces inflammation in the airways, helping to prevent asthma attacks and improve breathing.\\n- **Vilanterol**: This is a long-acting beta-agonist (LABA) that relaxes muscles in the airways to improve airflow and breathing.\\n\\n### Benefits:\\n- **Improved Breathing**: Helps ease breathing by reducing airway inflammation and relaxing airway muscles.\\n- **Reduced Symptoms**: Decreases the frequency and severity of asthma attacks and COPD flare-ups.\\n- **Long-Lasting Relief**: Provides relief that can last for 24 hours, meaning it typically needs to be used once a day.\\n- **Convenience**: Combines two medications in a single inhaler, simplifying treatment.\\n\\n### Potential Side Effects:\\n- **Common Side Effects**:\\n - Throat irritation or hoarseness\\n - Headache\\n - Cough\\n - Back pain\\n\\n- **Serious Side Effects** (require immediate medical attention):\\n - Difficulty breathing or swallowing\\n - Signs of an allergic reaction (rash, itching, swelling, severe dizziness)\\n - High blood pressure\\n - Increased heart rate\\n - Chest pain\\n - 
Eye problems like blurred vision or eye pain\\n\\n- **Long-term Use Side Effects**:\\n - Potential for increased risk of pneumonia in COPD patients\\n - Possible risk of osteoporosis\\n - Oral thrush (a fungal infection in the mouth) \\u2013 to reduce this risk, rinse your mouth with water after each use\\n\\nIt's important for users to follow their healthcare provider's instructions and not to use the Breo inhaler to treat sudden breathing problems. This medication is meant for long-term management, and a rescue inhaler should be used for acute symptoms. Always consult a healthcare professional for personalized advice and guidance.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 435, \"prompt_tokens\": 31, \"total_tokens\": 466}}}, \"final_result\": {\"id\": \"chatcmpl-Cbgpm1fLXLizRqds95YiTnF1MgSfk\", \"object\": \"chat.completion\", \"created\": 1763098826, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"The Breo inhaler, also known by its generic components fluticasone furoate and vilanterol, is used primarily to treat asthma and chronic obstructive pulmonary disease (COPD). 
Here's how it works and its potential benefits and side effects:\\n\\n### How It Works:\\n- **Fluticasone Furoate**: This is a corticosteroid that reduces inflammation in the airways, helping to prevent asthma attacks and improve breathing.\\n- **Vilanterol**: This is a long-acting beta-agonist (LABA) that relaxes muscles in the airways to improve airflow and breathing.\\n\\n### Benefits:\\n- **Improved Breathing**: Helps ease breathing by reducing airway inflammation and relaxing airway muscles.\\n- **Reduced Symptoms**: Decreases the frequency and severity of asthma attacks and COPD flare-ups.\\n- **Long-Lasting Relief**: Provides relief that can last for 24 hours, meaning it typically needs to be used once a day.\\n- **Convenience**: Combines two medications in a single inhaler, simplifying treatment.\\n\\n### Potential Side Effects:\\n- **Common Side Effects**:\\n - Throat irritation or hoarseness\\n - Headache\\n - Cough\\n - Back pain\\n\\n- **Serious Side Effects** (require immediate medical attention):\\n - Difficulty breathing or swallowing\\n - Signs of an allergic reaction (rash, itching, swelling, severe dizziness)\\n - High blood pressure\\n - Increased heart rate\\n - Chest pain\\n - Eye problems like blurred vision or eye pain\\n\\n- **Long-term Use Side Effects**:\\n - Potential for increased risk of pneumonia in COPD patients\\n - Possible risk of osteoporosis\\n - Oral thrush (a fungal infection in the mouth) \\u2013 to reduce this risk, rinse your mouth with water after each use\\n\\nIt's important for users to follow their healthcare provider's instructions and not to use the Breo inhaler to treat sudden breathing problems. This medication is meant for long-term management, and a rescue inhaler should be used for acute symptoms. 
Always consult a healthcare professional for personalized advice and guidance.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 435, \"prompt_tokens\": 31, \"total_tokens\": 466}}}38.3906542025-11-14 05:43:14.0161832025-11-14 05:43:14.016183
4b6e2f1ddefa4f1585977e1cbe2fa900960997700d79409680974700c0d678251{\"request_id\": \"7ed2ae00-47ea-97fc-8d72-69f143a42abc\", \"intermediate_results\": {\"templating\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: breo inhaler how it works.\", \"role\": \"user\"}], \"llm\": {\"id\": \"chatcmpl-Cbgpq82kWg8SHVgUnPQhAFvm7VaIR\", \"object\": \"chat.completion\", \"created\": 1763098830, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Breo Ellipta is an inhaler that contains two active ingredients, fluticasone furoate and vilanterol. It is commonly used for managing asthma and chronic obstructive pulmonary disease (COPD). Here's how it works, along with its benefits and potential side effects:\\n\\n### How It Works\\n- **Fluticasone Furoate**: This is a corticosteroid that helps reduce inflammation in the lungs, making it easier to breathe.\\n- **Vilanterol**: This is a long-acting beta-agonist (LABA) that relaxes muscles in the airways to improve airflow and reduce symptoms like wheezing and shortness of breath.\\n\\n### Benefits\\n- **Improved Breathing**: By reducing inflammation and relaxing airway muscles, the inhaler helps improve overall breathing.\\n- **Symptom Control**: Helps control and prevent symptoms such as wheezing, coughing, chest tightness, and shortness of breath.\\n- **Convenience**: Once-daily dosing can improve adherence and make it easier for patients to manage their condition.\\n- **Improved Quality of Life**: Better management of asthma or COPD symptoms can lead to an improved ability to carry out daily activities.\\n\\n### Potential Side Effects\\n- **Common Side Effects**: These may include headaches, sore throat, cough, and oral thrush (a fungal infection in the mouth).\\n- **Muscle Pain**: Some people may experience muscle and joint pain.\\n- **Increased Heart Rate**: 
Vilanterol may cause an increased heart rate or palpitations.\\n- **Risk of Infections**: Corticosteroids can suppress the immune system, increasing the risk of infections, such as pneumonia.\\n- **Hoarseness or Voice Changes**: Inhaled corticosteroids can affect the vocal cords.\\n- **Paradoxical Bronchospasm**: Rarely, inhalers can cause bronchospasm, which is a tightening of the muscles around the airways.\\n\\nIt's important for patients to discuss their symptoms and any side effects they experience with their healthcare provider, who can offer guidance tailored to individual health needs.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 431, \"prompt_tokens\": 31, \"total_tokens\": 462}}}, \"final_result\": {\"id\": \"chatcmpl-Cbgpq82kWg8SHVgUnPQhAFvm7VaIR\", \"object\": \"chat.completion\", \"created\": 1763098830, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Breo Ellipta is an inhaler that contains two active ingredients, fluticasone furoate and vilanterol. It is commonly used for managing asthma and chronic obstructive pulmonary disease (COPD). 
Here's how it works, along with its benefits and potential side effects:\\n\\n### How It Works\\n- **Fluticasone Furoate**: This is a corticosteroid that helps reduce inflammation in the lungs, making it easier to breathe.\\n- **Vilanterol**: This is a long-acting beta-agonist (LABA) that relaxes muscles in the airways to improve airflow and reduce symptoms like wheezing and shortness of breath.\\n\\n### Benefits\\n- **Improved Breathing**: By reducing inflammation and relaxing airway muscles, the inhaler helps improve overall breathing.\\n- **Symptom Control**: Helps control and prevent symptoms such as wheezing, coughing, chest tightness, and shortness of breath.\\n- **Convenience**: Once-daily dosing can improve adherence and make it easier for patients to manage their condition.\\n- **Improved Quality of Life**: Better management of asthma or COPD symptoms can lead to an improved ability to carry out daily activities.\\n\\n### Potential Side Effects\\n- **Common Side Effects**: These may include headaches, sore throat, cough, and oral thrush (a fungal infection in the mouth).\\n- **Muscle Pain**: Some people may experience muscle and joint pain.\\n- **Increased Heart Rate**: Vilanterol may cause an increased heart rate or palpitations.\\n- **Risk of Infections**: Corticosteroids can suppress the immune system, increasing the risk of infections, such as pneumonia.\\n- **Hoarseness or Voice Changes**: Inhaled corticosteroids can affect the vocal cords.\\n- **Paradoxical Bronchospasm**: Rarely, inhalers can cause bronchospasm, which is a tightening of the muscles around the airways.\\n\\nIt's important for patients to discuss their symptoms and any side effects they experience with their healthcare provider, who can offer guidance tailored to individual health needs.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 431, \"prompt_tokens\": 31, \"total_tokens\": 462}}}41.6401132025-11-14 05:43:14.0161842025-11-14 05:43:14.016184
ac009b3ba51c470ea3d672e1dc0fc290960997700d79409680974700c0d678251{\"request_id\": \"b2b05a40-a5a0-9a34-933b-df8af63c91a7\", \"intermediate_results\": {\"templating\": [{\"content\": \"List the benefits and side effects of the drug in the following consumer health question: qvar 40mg what is it for.\", \"role\": \"user\"}], \"llm\": {\"id\": \"chatcmpl-CbgpuOliFS4lXR3FtBVPP7oeSZOho\", \"object\": \"chat.completion\", \"created\": 1763098834, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Qvar (beclomethasone dipropionate) is an inhaled corticosteroid medication commonly used for the maintenance treatment of asthma. It helps to prevent and control symptoms such as wheezing and shortness of breath by reducing inflammation in the airways. It is not intended for the relief of acute asthma attacks.\\n\\n### Benefits:\\n1. **Reduces Inflammation**: Decreases swelling and irritation in the airways, helping to prevent asthma symptoms.\\n2. **Improves Breathing**: Helps to open airways, making it easier to breathe.\\n3. **Fewer Asthma Attacks**: Regular use can lead to fewer and milder asthma attacks.\\n4. **Maintenance of Asthma Control**: Helps in maintaining long-term control of asthma symptoms.\\n\\n### Potential Side Effects:\\n1. **Throat Irritation**: May cause a sore throat or hoarseness.\\n2. **Oral Thrush**: A fungal infection in the mouth; can be minimized by rinsing the mouth after use.\\n3. **Cough**: Some users may experience a cough immediately after use.\\n4. **Nasal Congestion**: Stuffy nose might occur in some individuals.\\n5. **Headache**: A common mild side effect.\\n6. **Potential for Delayed Growth in Children**: Long-term use can affect growth rates in children, so growth should be monitored.\\n7. 
**Other Systemic Effects**: While rare with inhaled steroids, potential side effects could include changes in mood, adrenal suppression, and bone density reduction if used at high doses for prolonged periods.\\n\\nIt is essential for users to follow their healthcare provider\\u2019s instructions and discuss any concerns or side effects they experience.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 348, \"prompt_tokens\": 33, \"total_tokens\": 381}}}, \"final_result\": {\"id\": \"chatcmpl-CbgpuOliFS4lXR3FtBVPP7oeSZOho\", \"object\": \"chat.completion\", \"created\": 1763098834, \"model\": \"gpt-4o-2024-08-06\", \"system_fingerprint\": \"fp_4a331a0222\", \"choices\": [{\"index\": 0, \"message\": {\"role\": \"assistant\", \"content\": \"Qvar (beclomethasone dipropionate) is an inhaled corticosteroid medication commonly used for the maintenance treatment of asthma. It helps to prevent and control symptoms such as wheezing and shortness of breath by reducing inflammation in the airways. It is not intended for the relief of acute asthma attacks.\\n\\n### Benefits:\\n1. **Reduces Inflammation**: Decreases swelling and irritation in the airways, helping to prevent asthma symptoms.\\n2. **Improves Breathing**: Helps to open airways, making it easier to breathe.\\n3. **Fewer Asthma Attacks**: Regular use can lead to fewer and milder asthma attacks.\\n4. **Maintenance of Asthma Control**: Helps in maintaining long-term control of asthma symptoms.\\n\\n### Potential Side Effects:\\n1. **Throat Irritation**: May cause a sore throat or hoarseness.\\n2. **Oral Thrush**: A fungal infection in the mouth; can be minimized by rinsing the mouth after use.\\n3. **Cough**: Some users may experience a cough immediately after use.\\n4. **Nasal Congestion**: Stuffy nose might occur in some individuals.\\n5. **Headache**: A common mild side effect.\\n6. 
**Potential for Delayed Growth in Children**: Long-term use can affect growth rates in children, so growth should be monitored.\\n7. **Other Systemic Effects**: While rare with inhaled steroids, potential side effects could include changes in mood, adrenal suppression, and bone density reduction if used at high doses for prolonged periods.\\n\\nIt is essential for users to follow their healthcare provider\\u2019s instructions and discuss any concerns or side effects they experience.\"}, \"finish_reason\": \"stop\"}], \"usage\": {\"completion_tokens\": 348, \"prompt_tokens\": 33, \"total_tokens\": 381}}}44.5463812025-11-14 05:43:14.0161852025-11-14 05:43:14.016185
\n", + "
\n", + " \n", + "
\n", + "

Table: evaluation_result

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
submission_idrun_idrepetition_countmetricaggregating_valuemetric_resulterrorcreated_atupdated_at
f1132352cb524392b733ca0773c6bef0960997700d79409680974700c0d678251\"BERT Score/F1\"0.522608{\"precision\": 0.4782029390335083, \"recall\": 0.5761047601699829, \"f1\": 0.5226083397865295}None2025-11-14 05:52:03.1291652025-11-14 05:52:03.129169
ce3fd3f1f8f04499847153fb1d3a2b0f960997700d79409680974700c0d678251\"BERT Score/F1\"0.490477{\"precision\": 0.4271319508552551, \"recall\": 0.5758814811706543, \"f1\": 0.4904767870903015}None2025-11-14 05:52:03.1291702025-11-14 05:52:03.129171
e0a2c70183dd4ce289772b5ae74957b6960997700d79409680974700c0d678251\"BERT Score/F1\"0.479421{\"precision\": 0.4179512560367584, \"recall\": 0.5620903968811035, \"f1\": 0.4794212579727173}None2025-11-14 05:52:03.1291712025-11-14 05:52:03.129174
61ca98967e5b45f4865db10d81de298b960997700d79409680974700c0d678251\"BERT Score/F1\"0.343647{\"precision\": 0.31056681275367737, \"recall\": 0.3846154510974884, \"f1\": 0.34364742040634155}None2025-11-14 05:52:03.1291752025-11-14 05:52:03.129176
9f84346869014412a1fd55ad52bd23b1960997700d79409680974700c0d678251\"BERT Score/F1\"0.331058{\"precision\": 0.27349621057510376, \"recall\": 0.4193095266819, \"f1\": 0.3310583531856537}None2025-11-14 05:52:03.1291762025-11-14 05:52:03.129177
98a14a854ba94f09b7a7b48c30878f3b960997700d79409680974700c0d678251\"BERT Score/F1\"0.532768{\"precision\": 0.457075834274292, \"recall\": 0.6385050415992737, \"f1\": 0.5327680706977844}None2025-11-14 05:52:03.1291772025-11-14 05:52:03.129178
6c34dd7a4bba4f4e8bd9bff5ab9afa1f960997700d79409680974700c0d678251\"BERT Score/F1\"0.549074{\"precision\": 0.5064661502838135, \"recall\": 0.5995101928710938, \"f1\": 0.5490743517875671}None2025-11-14 05:52:03.1291782025-11-14 05:52:03.129179
b8c446bc2c2c423e9d60b1e7511f5c36960997700d79409680974700c0d678251\"BERT Score/F1\"0.585487{\"precision\": 0.5031953454017639, \"recall\": 0.6999572515487671, \"f1\": 0.5854871869087219}None2025-11-14 05:52:03.1291802025-11-14 05:52:03.129180
4b6e2f1ddefa4f1585977e1cbe2fa900960997700d79409680974700c0d678251\"BERT Score/F1\"0.506072{\"precision\": 0.49769705533981323, \"recall\": 0.5147332549095154, \"f1\": 0.5060718655586243}None2025-11-14 05:52:03.1291812025-11-14 05:52:03.129181
ac009b3ba51c470ea3d672e1dc0fc290960997700d79409680974700c0d678251\"BERT Score/F1\"0.509633{\"precision\": 0.4473775625228882, \"recall\": 0.5920162200927734, \"f1\": 0.5096331238746643}None2025-11-14 05:52:03.1291822025-11-14 05:52:03.129182
\n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# viewing the results from sqlite db in tabular format..\n", + "import sqlite3\n", + "import pandas as pd\n", + "from IPython.display import display, HTML\n", + "\n", + "# Path to your SQLite database file\n", + "db_file = 'results-new/results.db'\n", + "\n", + "connection = sqlite3.connect(db_file)\n", + "\n", + "# Specify the table names you want to display\n", + "table_names = ['run','configuration', 'submission', 'submission_result', 'evaluation_result'] \n", + "\n", + "# Create the CSS and HTML container\n", + "html_content = \"\"\"\n", + "\n", + "
\n", + "\"\"\"\n", + "\n", + "for table_name in table_names:\n", + " query = f\"SELECT * FROM {table_name};\"\n", + " df = pd.read_sql_query(query, connection)\n", + " # If you want to see all the rows across all tables, remove/comment the next line\n", + " df = df.head(10) # Limiting the number of rows displayed\n", + " table_html = df.to_html(classes='table-container', index=False)\n", + " html_content += f\"\"\"\n", + "
\n", + "

Table: {table_name}

\n", + " {table_html}\n", + "
\n", + " \"\"\"\n", + "\n", + "html_content += \"
\"\n", + "\n", + "display(HTML(html_content))\n", + "\n", + "# Close the connection\n", + "connection.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "{'id': 'ef8963f68ec99242', 'message': 'Deletion scheduled', 'targetStatus': 'DELETED'}\n" + ] + } + ], + "source": [ + "#Delete Execution Id\n", + "def delete_execution():\n", + " headers = _get_headers()\n", + " EXEC_ID = execution_id\n", + " GET_EXECUTIONS_ENDPOINT = '/v2/lm/executions/'\n", + " request_url = f\"{AICORE_BASE_URL}{GET_EXECUTIONS_ENDPOINT}{EXEC_ID}\"\n", + " try:\n", + " response = requests.delete(\n", + " request_url, headers=headers, params={\"AI-Resource-Group\":AICORE_RESOURCE_GROUP}, timeout=120\n", + " )\n", + " print(response)\n", + " if(response.status_code != 202):\n", + " raise\n", + " result = response.json()\n", + " print(result)\n", + " except:\n", + " logging.error(\"Error occurred while attempting to delete a Configuration\")\n", + " raise\n", + " \n", + "delete_execution()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/ai-core-genaihub-evaluation/img/AI_Core.json b/tutorials/ai-core-genaihub-evaluation/img/AI_Core.json new file mode 100644 index 000000000..56a807a86 --- /dev/null +++ b/tutorials/ai-core-genaihub-evaluation/img/AI_Core.json @@ -0,0 +1,1578 @@ +{ + "name": "AI Core", + "version": "1", + "items": [ + { + "type": "http", + "name": "get_token", + "filename": "get_token.bru", + "seq": 1, + "request": { + "url": "{{ai_auth_url}}/oauth/token", + 
"method": "POST", + "headers": [ + { + "name": "Content-Type", + "value": "application/x-www-form-urlencoded", + "enabled": true + } + ], + "params": [], + "body": { + "mode": "formUrlEncoded", + "formUrlEncoded": [ + { + "name": "grant_type", + "value": "client_credentials", + "enabled": true + }, + { + "name": "client_id", + "value": "{{client_id}}", + "enabled": true + }, + { + "name": "client_secret", + "value": "{{client_secret}}", + "enabled": true + } + ], + "multipartForm": [], + "file": [] + }, + "script": { + "res": "if (res.getStatus() == 200) {\n bru.setEnvVar(\"access_token\", res.body.access_token);\n}" + }, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "", + "auth": { + "mode": "none" + } + } + }, + { + "type": "folder", + "name": "admin", + "filename": "admin", + "root": { + "meta": { + "name": "admin" + } + }, + "items": [ + { + "type": "folder", + "name": "objectStoreSecrets", + "filename": "objectStoreSecrets", + "root": { + "meta": { + "name": "objectStoreSecrets" + } + }, + "items": [ + { + "type": "http", + "name": "Create a secret", + "filename": "Create a secret.bru", + "seq": 1, + "request": { + "url": "{{baseUrl}}/v2/admin/objectStoreSecrets", + "method": "POST", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "", + "enabled": true + }, + { + "name": "Content-Type", + "value": "application/json", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + }, + { + "name": "Authorization", + "value": "", + "enabled": true + } + ], + "params": [], + "body": { + "mode": "json", + "json": "{\n \"name\": \"genai-data\",\n \"data\": {\n \"AWS_ACCESS_KEY_ID\": \"\",\n \"AWS_SECRET_ACCESS_KEY\": \"\"\n },\n \"type\": \"S3\",\n \"bucket\": \"\",\n \"endpoint\": \"https://s3.eu-central-1.amazonaws.com\",\n \"region\": \"\",\n \"pathPrefix\": \"\" \n }", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + 
"docs": "Create a secret based on the configuration in the request body\n", + "auth": { + "mode": "bearer", + "bearer": { + "token": "{{access_token}}" + } + } + } + }, + { + "type": "http", + "name": "Get a list of metadata of available secrets.", + "filename": "Get a list of metadata of available secrets.bru", + "seq": 2, + "request": { + "url": "{{baseUrl}}/admin/objectStoreSecrets?$top=&$skip=&$count=", + "method": "GET", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [ + { + "name": "$top", + "value": "", + "type": "query", + "enabled": true + }, + { + "name": "$skip", + "value": "", + "type": "query", + "enabled": true + }, + { + "name": "$count", + "value": "", + "type": "query", + "enabled": true + } + ], + "body": { + "mode": "none", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Retrieve a list of metadata of the stored secrets.\n", + "auth": { + "mode": "oauth2", + "oauth2": { + "grantType": "authorization_code", + "callbackUrl": "", + "authorizationUrl": "", + "accessTokenUrl": "", + "refreshTokenUrl": "", + "clientId": "", + "clientSecret": "", + "scope": "", + "credentialsPlacement": "basic_auth_header", + "pkce": false, + "credentialsId": "credentials", + "tokenPlacement": "header", + "tokenHeaderPrefix": "Bearer", + "tokenQueryKey": "access_token", + "autoFetchToken": true, + "autoRefreshToken": false + } + } + } + }, + { + "type": "folder", + "name": "{objectStoreName}", + "filename": "{objectStoreName}", + "root": { + "meta": { + "name": "{objectStoreName}" + } + }, + "items": [ + { + "type": "http", + "name": "Delete object store secret", + "filename": "Delete object store secret.bru", + "seq": 2, + "request": { + "url": "{{baseUrl}}/admin/objectStoreSecrets/:objectStoreName", + "method": "DELETE", + "headers": [ + { + 
"name": "AI-Resource-Group", + "value": "", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [ + { + "name": "objectStoreName", + "value": "qKoZ-aHSe", + "type": "path", + "enabled": true + } + ], + "body": { + "mode": "none", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Delete a secret with the name of objectStoreName if it exists.", + "auth": { + "mode": "oauth2", + "oauth2": { + "grantType": "authorization_code", + "callbackUrl": "", + "authorizationUrl": "", + "accessTokenUrl": "", + "refreshTokenUrl": "", + "clientId": "", + "clientSecret": "", + "scope": "", + "credentialsPlacement": "basic_auth_header", + "pkce": false, + "credentialsId": "credentials", + "tokenPlacement": "header", + "tokenHeaderPrefix": "Bearer", + "tokenQueryKey": "access_token", + "autoFetchToken": true, + "autoRefreshToken": false + } + } + } + }, + { + "type": "http", + "name": "Returns the of metadata of secrets which match the query parameter.", + "filename": "Returns the of metadata of secrets which match the query parameter.bru", + "seq": 1, + "request": { + "url": "{{baseUrl}}/v2/admin/objectStoreSecrets", + "method": "GET", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [], + "body": { + "mode": "none", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "This retrieves the metadata of the stored secret which match the parameter objectStoreName.\nThe fetched secret is constructed like objectStoreName-object-store-secret\nThe base64 encoded field for the stored secret is not returned.\n", + "auth": { + "mode": "oauth2", + "oauth2": { + "grantType": "authorization_code", + "callbackUrl": "", + 
"authorizationUrl": "", + "accessTokenUrl": "", + "refreshTokenUrl": "", + "clientId": "", + "clientSecret": "", + "scope": "", + "credentialsPlacement": "basic_auth_header", + "pkce": false, + "credentialsId": "credentials", + "tokenPlacement": "header", + "tokenHeaderPrefix": "Bearer", + "tokenQueryKey": "access_token", + "autoFetchToken": true, + "autoRefreshToken": false + } + } + } + }, + { + "type": "http", + "name": "Update object store secret", + "filename": "Update object store secret.bru", + "seq": 3, + "request": { + "url": "{{baseUrl}}/admin/objectStoreSecrets/:objectStoreName", + "method": "PATCH", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "", + "enabled": true + }, + { + "name": "Content-Type", + "value": "application/json", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [ + { + "name": "objectStoreName", + "value": "qKoZ-aHSe", + "type": "path", + "enabled": true + } + ], + "body": { + "mode": "json", + "json": "{\n \"name\": \"\",\n \"type\": \"\",\n \"data\": {},\n \"bucket\": \"\",\n \"endpoint\": \"\",\n \"region\": \"\",\n \"pathPrefix\": \"\",\n \"verifyssl\": \"\",\n \"usehttps\": \"1\"\n}", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Update a secret with name of objectStoreName if it exists.\n", + "auth": { + "mode": "oauth2", + "oauth2": { + "grantType": "authorization_code", + "callbackUrl": "", + "authorizationUrl": "", + "accessTokenUrl": "", + "refreshTokenUrl": "", + "clientId": "", + "clientSecret": "", + "scope": "", + "credentialsPlacement": "basic_auth_header", + "pkce": false, + "credentialsId": "credentials", + "tokenPlacement": "header", + "tokenHeaderPrefix": "Bearer", + "tokenQueryKey": "access_token", + "autoFetchToken": true, + "autoRefreshToken": false + } + } + } + } + ] + } + ] + } + ] + }, + { + "type": "folder", + "name": "lm", + 
"filename": "lm", + "root": { + "meta": { + "name": "lm" + } + }, + "items": [ + { + "type": "folder", + "name": "configurations", + "filename": "configurations", + "root": { + "meta": { + "name": "configurations" + } + }, + "items": [ + { + "type": "http", + "name": "Create configuration Copy", + "filename": "Create configuration Copy.bru", + "seq": 3, + "request": { + "url": "{{baseUrl}}/v2/lm/configurations", + "method": "DELETE", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "{{resource_group}}", + "enabled": true + }, + { + "name": "Content-Type", + "value": "application/json", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [], + "body": { + "mode": "json", + "json": "{\n \"id\": \"\"\n}", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Create a new configuration linked to a specific scenario and executable for use in an execution\nor deployment.\n", + "auth": { + "mode": "bearer", + "bearer": { + "token": "{{access_token}}" + } + } + } + }, + { + "type": "http", + "name": "Create configuration", + "filename": "Create configuration.bru", + "seq": 2, + "request": { + "url": "{{baseUrl}}/v2/lm/configurations", + "method": "POST", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "{{resource_group}}", + "enabled": true + }, + { + "name": "Content-Type", + "value": "application/json", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [], + "body": { + "mode": "json", + "json": "{\n \"name\": \"genai-eval-conf\",\n \"scenarioId\": \"genai-evaluations\",\n \"executableId\": \"genai-evaluations-simplified\",\n \"inputArtifactBindings\": [\n {\n \"key\": \"datasetFolder\",\n \"artifactId\": \"\"\n }\n ],\n \"parameterBindings\": [\n {\n \"key\": \"repetitions\",\n \"value\": \"1\"\n },\n {\n \"key\": 
\"orchestrationDeploymentURL\",\n \"value\": \"\"\n\n },\n {\n \"key\": \"metrics\",\n \"value\": \"language_match\"\n },\n {\n \"key\": \"testDataset\",\n \"value\": \"{\\\"path\\\": \\\"testdata/global_customer_queries.csv\\\", \\\"type\\\": \\\"csv\\\"}\"\n },\n {\n \"key\": \"promptTemplate\",\n \"value\": \"\"\n },\n {\n \"key\": \"models\",\n \"value\": \"gpt-4.1:latest\"\n }\n ]\n}\n", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Create a new configuration linked to a specific scenario and executable for use in an execution\nor deployment.\n", + "auth": { + "mode": "bearer", + "bearer": { + "token": "{{access_token}}" + } + } + } + }, + { + "type": "http", + "name": "Get list of configurations", + "filename": "Get list of configurations.bru", + "seq": 1, + "request": { + "url": "{{baseUrl}}/v2/lm/configurations", + "method": "GET", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "{{resource_group}}", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [], + "body": { + "mode": "none", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Retrieve a list of configurations. 
Filter results by scenario ID or a list of executable IDs.\nSearch for configurations containing the search string as substring in the configuration name.\n", + "auth": { + "mode": "bearer", + "bearer": { + "token": "{{access_token}}" + } + } + } + }, + { + "type": "folder", + "name": "{configurationId}", + "filename": "{configurationId}", + "root": { + "meta": { + "name": "{configurationId}" + } + }, + "items": [ + { + "type": "http", + "name": "Get configuration by ID", + "filename": "Get configuration by ID.bru", + "seq": 1, + "request": { + "url": "{{baseUrl}}/v2/lm/configurations", + "method": "GET", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "{{resource_group}}", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [], + "body": { + "mode": "none", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Retrieve details for configuration with configurationId.", + "auth": { + "mode": "bearer", + "bearer": { + "token": "{{access_token}}" + } + } + } + } + ] + }, + { + "type": "folder", + "name": "$count", + "filename": "$count", + "root": { + "meta": { + "name": "$count" + } + }, + "items": [ + { + "type": "http", + "name": "Get number of configurations", + "filename": "Get number of configurations.bru", + "seq": 1, + "request": { + "url": "{{baseUrl}}/lm/configurations/$count?scenarioId=iiwMZ8.BjeF0SgmlZJM11XXkDUxP7Sg5GQLKEEsaWb.om5wMy1gN3AtN&$search=}\"NI2Kn!V&searchCaseInsensitive=false&executableIds=T_jtbUJzwg0e.okSV667jeZejqVb,3e0cmfc4c-6YavNz92uztZE", + "method": "GET", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "", + "enabled": true + }, + { + "name": "Accept", + "value": "text/plain", + "enabled": true + } + ], + "params": [ + { + "name": "scenarioId", + "value": "iiwMZ8.BjeF0SgmlZJM11XXkDUxP7Sg5GQLKEEsaWb.om5wMy1gN3AtN", + "type": "query", + "enabled": true + }, + { + 
"name": "$search", + "value": "}\"NI2Kn!V", + "type": "query", + "enabled": true + }, + { + "name": "searchCaseInsensitive", + "value": "false", + "type": "query", + "enabled": true + }, + { + "name": "executableIds", + "value": "T_jtbUJzwg0e.okSV667jeZejqVb,3e0cmfc4c-6YavNz92uztZE", + "type": "query", + "enabled": true + } + ], + "body": { + "mode": "none", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Retrieve the number of available configurations that match the specified filter criteria.\nFilter criteria include a scenarioId or executableIdsList. Search by substring of configuration name is also possible.\n", + "auth": { + "mode": "oauth2", + "oauth2": { + "grantType": "authorization_code", + "callbackUrl": "", + "authorizationUrl": "", + "accessTokenUrl": "", + "refreshTokenUrl": "", + "clientId": "", + "clientSecret": "", + "scope": "", + "credentialsPlacement": "basic_auth_header", + "pkce": false, + "credentialsId": "credentials", + "tokenPlacement": "header", + "tokenHeaderPrefix": "Bearer", + "tokenQueryKey": "access_token", + "autoFetchToken": true, + "autoRefreshToken": false + } + } + } + } + ] + } + ] + }, + { + "type": "folder", + "name": "artifacts", + "filename": "artifacts", + "root": { + "meta": { + "name": "artifacts" + } + }, + "items": [ + { + "type": "http", + "name": "Get list of artifacts", + "filename": "Get list of artifacts.bru", + "seq": 2, + "request": { + "url": "{{baseUrl}}/v2/lm/artifacts", + "method": "GET", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [ + { + "name": "scenarioId", + "value": "iiwMZ8.BjeF0SgmlZJM11XXkDUxP7Sg5GQLKEEsaWb.om5wMy1gN3AtN", + "type": "query", + "enabled": false + }, + { + "name": "executionId", + "value": "iiwMZ8.BjeF0SgmlZJM11XXkDUxP7Sg5GQLKEEsaWb.om5wMy1gN3AtN", + "type": 
"query", + "enabled": false + }, + { + "name": "name", + "value": "[G7 ovyt8i", + "type": "query", + "enabled": false + }, + { + "name": "kind", + "value": "other", + "type": "query", + "enabled": false + }, + { + "name": "artifactLabelSelector", + "value": "ext.ai.sap.com/bXN1EAk=D*", + "type": "query", + "enabled": false + }, + { + "name": "$top", + "value": "10000", + "type": "query", + "enabled": false + }, + { + "name": "$skip", + "value": "", + "type": "query", + "enabled": false + }, + { + "name": "$search", + "value": "}\"NI2Kn!V", + "type": "query", + "enabled": false + }, + { + "name": "searchCaseInsensitive", + "value": "false", + "type": "query", + "enabled": false + }, + { + "name": "$expand", + "value": "scenario", + "type": "query", + "enabled": false + } + ], + "body": { + "mode": "none", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Retrieve a list of artifacts that matches the specified filter criteria.\nFilter criteria include scenario ID, execution ID, an artifact name, artifact kind, or artifact labels.\nUse top/skip parameters to paginate the result list.\nSearch by substring of artifact name or description, if required.\n", + "auth": { + "mode": "bearer", + "bearer": { + "token": "{{access_token}}" + } + } + } + }, + { + "type": "http", + "name": "Register artifact", + "filename": "Register artifact.bru", + "seq": 1, + "request": { + "url": "{{baseUrl}}/v2/lm/artifacts", + "method": "POST", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "{{resource_group}}", + "enabled": true + }, + { + "name": "Content-Type", + "value": "application/json", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [], + "body": { + "mode": "json", + "json": "{\n \"name\": \"aiconfig\",\n \"kind\": \"dataset\",\n \"url\": \"ai://genai-data/genaiEvaluation/14af1af80b974edb8731632d17286343\",\n 
\"scenarioId\": \"genai-evaluations\"\n}\n", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Register an artifact for use in a configuration, for example a model or a dataset.", + "auth": { + "mode": "bearer", + "bearer": { + "token": "{{access_token}}" + } + } + } + }, + { + "type": "folder", + "name": "$count", + "filename": "$count", + "root": { + "meta": { + "name": "$count" + } + }, + "items": [ + { + "type": "http", + "name": "Get number of artifacts", + "filename": "Get number of artifacts.bru", + "seq": 1, + "request": { + "url": "{{baseUrl}}/lm/artifacts/$count?scenarioId=iiwMZ8.BjeF0SgmlZJM11XXkDUxP7Sg5GQLKEEsaWb.om5wMy1gN3AtN&executionId=iiwMZ8.BjeF0SgmlZJM11XXkDUxP7Sg5GQLKEEsaWb.om5wMy1gN3AtN&name=[G7 ovyt8i&kind=other&$search=}\"NI2Kn!V&searchCaseInsensitive=false&artifactLabelSelector=ext.ai.sap.com/bXN1EAk=D*", + "method": "GET", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "", + "enabled": true + }, + { + "name": "Accept", + "value": "text/plain", + "enabled": true + } + ], + "params": [ + { + "name": "scenarioId", + "value": "iiwMZ8.BjeF0SgmlZJM11XXkDUxP7Sg5GQLKEEsaWb.om5wMy1gN3AtN", + "type": "query", + "enabled": true + }, + { + "name": "executionId", + "value": "iiwMZ8.BjeF0SgmlZJM11XXkDUxP7Sg5GQLKEEsaWb.om5wMy1gN3AtN", + "type": "query", + "enabled": true + }, + { + "name": "name", + "value": "[G7 ovyt8i", + "type": "query", + "enabled": true + }, + { + "name": "kind", + "value": "other", + "type": "query", + "enabled": true + }, + { + "name": "$search", + "value": "}\"NI2Kn!V", + "type": "query", + "enabled": true + }, + { + "name": "searchCaseInsensitive", + "value": "false", + "type": "query", + "enabled": true + }, + { + "name": "artifactLabelSelector", + "value": "ext.ai.sap.com/bXN1EAk=D*", + "type": "query", + "enabled": true + } + ], + "body": { + "mode": "none", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + 
"script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Retrieve the number of available artifacts that match the specified filter criteria.\nFilter criteria include a scenarioId, executionId, an artifact name, artifact kind, or artifact labels.\nSearch by substring of artifact name or description is also possible.\n", + "auth": { + "mode": "oauth2", + "oauth2": { + "grantType": "authorization_code", + "callbackUrl": "", + "authorizationUrl": "", + "accessTokenUrl": "", + "refreshTokenUrl": "", + "clientId": "", + "clientSecret": "", + "scope": "", + "credentialsPlacement": "basic_auth_header", + "pkce": false, + "credentialsId": "credentials", + "tokenPlacement": "header", + "tokenHeaderPrefix": "Bearer", + "tokenQueryKey": "access_token", + "autoFetchToken": true, + "autoRefreshToken": false + } + } + } + } + ] + } + ] + }, + { + "type": "folder", + "name": "executions", + "filename": "executions", + "root": { + "meta": { + "name": "executions" + } + }, + "items": [ + { + "type": "http", + "name": "Create execution", + "filename": "Create execution.bru", + "seq": 2, + "request": { + "url": "{{baseUrl}}/v2/lm/executions", + "method": "POST", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "{{resource_group}}", + "enabled": true + }, + { + "name": "Content-Type", + "value": "application/json", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [], + "body": { + "mode": "json", + "json": "{\n \"configurationId\": \"\"\n}", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Create an execution using the configuration specified by configurationId.", + "auth": { + "mode": "bearer", + "bearer": { + "token": "{{access_token}}" + } + } + } + }, + { + "type": "http", + "name": "Get list of executions", + "filename": "Get list of executions.bru", + "seq": 1, + "request": { + "url": 
"{{baseUrl}}/v2/lm/executions/", + "method": "GET", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "{{resource_group}}", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [ + { + "name": "scenarioId", + "value": "iiwMZ8.BjeF0SgmlZJM11XXkDUxP7Sg5GQLKEEsaWb.om5wMy1gN3AtN", + "type": "query", + "enabled": false + }, + { + "name": "executionScheduleId", + "value": "iiwMZ8.BjeF0SgmlZJM11XXkDUxP7Sg5GQLKEEsaWb.om5wMy1gN3AtN", + "type": "query", + "enabled": false + }, + { + "name": "status", + "value": "DEAD", + "type": "query", + "enabled": false + }, + { + "name": "$top", + "value": "10000", + "type": "query", + "enabled": false + }, + { + "name": "$skip", + "value": "", + "type": "query", + "enabled": false + }, + { + "name": "$select", + "value": "status", + "type": "query", + "enabled": false + } + ], + "body": { + "mode": "none", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Retrieve a list of executions that match the specified filter criteria.\nFilter criteria include a list of executableIds, a scenarioId, a configurationId, or a execution status.\nWith top/skip parameters it is possible to paginate the result list.\nWith select parameter it is possible to select only status.\n", + "auth": { + "mode": "bearer", + "bearer": { + "token": "{{access_token}}" + } + } + } + }, + { + "type": "folder", + "name": "$count", + "filename": "$count", + "root": { + "meta": { + "name": "$count" + } + } + } + ] + }, + { + "type": "folder", + "name": "deployments", + "filename": "deployments", + "root": { + "meta": { + "name": "deployments" + } + }, + "items": [ + { + "type": "http", + "name": "Create deployment", + "filename": "Create deployment.bru", + "seq": 2, + "request": { + "url": "{{baseUrl}}/v2/lm/deployments", + "method": "POST", + "headers": [ + { + "name": "AI-Resource-Group", + "value": 
"{{resource_group}}", + "enabled": true + }, + { + "name": "Content-Type", + "value": "application/json", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [], + "body": { + "mode": "json", + "json": "{\n \"configurationId\": \"\"\n}", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Create a deployment using the configuration specified by configurationId after synchronously checking the\ncorrectness of the configuration.\n", + "auth": { + "mode": "bearer", + "bearer": { + "token": "{{access_token}}" + } + } + } + }, + { + "type": "http", + "name": "Get list of deployments", + "filename": "Get list of deployments.bru", + "seq": 1, + "request": { + "url": "{{baseUrl}}/v2/lm/deployments", + "method": "GET", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "{{resource_group}}", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [], + "body": { + "mode": "none", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Retrieve a list of deployments that match the specified filter criteria.\nFilter criteria include a list of executableIds, a scenarioId, a configurationId, or a deployment status.\nWith top/skip parameters it is possible to paginate the result list.\nWith select parameter it is possible to select only status.\n", + "auth": { + "mode": "bearer", + "bearer": { + "token": "{{access_token}}" + } + } + } + }, + { + "type": "folder", + "name": "$count", + "filename": "$count", + "root": { + "meta": { + "name": "$count" + } + }, + "items": [ + { + "type": "http", + "name": "Get number of deployments", + "filename": "Get number of deployments.bru", + "seq": 1, + "request": { + "url": 
"{{baseUrl}}/lm/deployments/$count?executableIds=T_jtbUJzwg0e.okSV667jeZejqVb,3e0cmfc4c-6YavNz92uztZE&configurationId=iiwMZ8.BjeF0SgmlZJM11XXkDUxP7Sg5GQLKEEsaWb.om5wMy1gN3AtN&scenarioId=iiwMZ8.BjeF0SgmlZJM11XXkDUxP7Sg5GQLKEEsaWb.om5wMy1gN3AtN&status=DEAD", + "method": "GET", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "", + "enabled": true + }, + { + "name": "Accept", + "value": "text/plain", + "enabled": true + } + ], + "params": [ + { + "name": "executableIds", + "value": "T_jtbUJzwg0e.okSV667jeZejqVb,3e0cmfc4c-6YavNz92uztZE", + "type": "query", + "enabled": true + }, + { + "name": "configurationId", + "value": "iiwMZ8.BjeF0SgmlZJM11XXkDUxP7Sg5GQLKEEsaWb.om5wMy1gN3AtN", + "type": "query", + "enabled": true + }, + { + "name": "scenarioId", + "value": "iiwMZ8.BjeF0SgmlZJM11XXkDUxP7Sg5GQLKEEsaWb.om5wMy1gN3AtN", + "type": "query", + "enabled": true + }, + { + "name": "status", + "value": "DEAD", + "type": "query", + "enabled": true + } + ], + "body": { + "mode": "none", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "Retrieve the number of available deployments. 
The number can be filtered by\nscenarioId, configurationId, executableIdsList or by deployment status.\n", + "auth": { + "mode": "oauth2", + "oauth2": { + "grantType": "authorization_code", + "callbackUrl": "", + "authorizationUrl": "", + "accessTokenUrl": "", + "refreshTokenUrl": "", + "clientId": "", + "clientSecret": "", + "scope": "", + "credentialsPlacement": "basic_auth_header", + "pkce": false, + "credentialsId": "credentials", + "tokenPlacement": "header", + "tokenHeaderPrefix": "Bearer", + "tokenQueryKey": "access_token", + "autoFetchToken": true, + "autoRefreshToken": false + } + } + } + } + ] + } + ] + }, + { + "type": "folder", + "name": "metrics", + "filename": "metrics", + "root": { + "meta": { + "name": "metrics" + } + }, + "items": [ + { + "type": "http", + "name": "Evaluation Metrics via Execution ID", + "filename": "Evaluation Metrics via Execution ID.bru", + "seq": 4, + "request": { + "url": "{{baseUrl}}/v2/lm/metrics?tagFilters=evaluation.ai.sap.com/child-of=", + "method": "GET", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "{{resource_group}}", + "enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [ + { + "name": "tagFilters", + "url": "{{baseUrl}}/v2/lm/metrics?tagFilters=evaluation.ai.sap.com/child-of=", + "value": "evaluation.ai.sap.com/child-of=", + "type": "query", + "enabled": true + } + ], + "body": { + "mode": "none", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "", + "auth": { + "mode": "bearer", + "bearer": { + "token": "{{access_token}}" + } + } + } + }, + { + "type": "http", + "name": "Metrics by Run Name", + "filename": "Metrics by Run Name.bru", + "seq": 5, + "request": { + "url": "{{baseUrl}}/v2/lm/metrics?tagFilters=evaluation.ai.sap.com/run-name=run1", + "method": "GET", + "headers": [ + { + "name": "AI-Resource-Group", + "value": "{{resource_group}}", + 
"enabled": true + }, + { + "name": "Accept", + "value": "application/json", + "enabled": true + } + ], + "params": [ + { + "name": "tagFilters", + "value": "evaluation.ai.sap.com/run-name=run1", + "type": "query", + "enabled": true + } + ], + "body": { + "mode": "none", + "formUrlEncoded": [], + "multipartForm": [], + "file": [] + }, + "script": {}, + "vars": {}, + "assertions": [], + "tests": "", + "docs": "", + "auth": { + "mode": "bearer", + "bearer": { + "token": "{{access_token}}" + } + } + } + } + ] + } + ] + } + ], + "activeEnvironmentUid": "lWUmIcEkGnkMxwNBILLmY", + "environments": [ + { + "variables": [ + { + "name": "ai_auth_url", + "value": "", + "enabled": true, + "secret": false, + "type": "text" + }, + { + "name": "ai_api_url", + "value": "", + "enabled": true, + "secret": false, + "type": "text" + }, + { + "name": "client_id", + "value": "", + "enabled": true, + "secret": false, + "type": "text" + }, + { + "name": "client_secret", + "value": "", + "enabled": true, + "secret": false, + "type": "text" + }, + { + "name": "resource_group", + "value": "", + "enabled": true, + "secret": false, + "type": "text" + }, + { + "name": "orchestration_service_url", + "value": "", + "enabled": true, + "secret": false, + "type": "text" + }, + { + "name": "access_token", + "value": "", + "enabled": true, + "secret": true, + "type": "text" + } + ], + "name": "intprod" + } + ], + "root": { + "request": { + "auth": { + "mode": "oauth2", + "oauth2": { + "grantType": "authorization_code", + "callbackUrl": "", + "authorizationUrl": "", + "accessTokenUrl": "", + "refreshTokenUrl": "", + "clientId": "", + "clientSecret": "", + "scope": "", + "state": "", + "pkce": false, + "credentialsPlacement": "basic_auth_header", + "credentialsId": "credentials", + "tokenPlacement": "header", + "tokenHeaderPrefix": "Bearer", + "tokenQueryKey": "access_token", + "autoFetchToken": true, + "autoRefreshToken": false + } + }, + "vars": { + "req": [ + { + "name": "region", + "value": 
"prod.eu-central-1.aws", + "enabled": true, + "local": false, + "uid": "oYVk4DuVpyYqqP2roBVjE" + }, + { + "name": "baseUrl", + "value": "", + "enabled": true, + "local": false, + "uid": "I4KjDm7FxpSRwUYzjwfPG" + }, + { + "name": "auth_url", + "value": "", + "enabled": true, + "local": false, + "uid": "zuftvyCURtA9XYErCYDgo" + }, + { + "name": "client_id", + "value": "", + "enabled": true, + "local": false, + "uid": "JfGEVKm71BYTgR8UkQUGv" + }, + { + "name": "client_secret", + "value": "", + "enabled": true, + "local": false, + "uid": "ls3RYTJ40baTl8eYmilGt" + }, + { + "name": "AWS_ACCESS_KEY_ID", + "value": "", + "enabled": true, + "local": false, + "uid": "2O0YTTAdmYltm5XiHMhP2" + }, + { + "name": "AWS_SECRET_ACCESS_KEY", + "value": "", + "enabled": true, + "local": false, + "uid": "8rc4RYyPcHXyTkAnnI981" + }, + { + "name": "BUCKET_NAME", + "value": "", + "enabled": true, + "local": false, + "uid": "HqFIe8Rvc14i41WIAGGkl" + }, + { + "name": "DATABASE_URL", + "value": "https://s3-eu-central-1.amazonaws.com", + "enabled": true, + "local": false, + "uid": "aWIwuJZH5XQ5Guu2D69Sq" + } + ] + } + }, + "docs": "Provides tools to manage your scenarios and workflows in SAP AI Core. Execute pipelines as a batch job, for example to pre-process or train your models, or perform batch inference. Serve inference requests of trained models. Deploy а trained machine learning model as a web service to serve inference requests with high performance. 
Register your own Docker registry, synchronize your AI content from your own git repository, and register your own object store for training data and trained models.\n", + "meta": { + "name": "AI Core" + } + }, + "brunoConfig": { + "version": "1", + "name": "AI Core", + "type": "collection", + "ignore": [ + "node_modules", + ".git" + ], + "size": 0.10747432708740234, + "filesCount": 151 + } +} diff --git a/tutorials/ai-core-genaihub-evaluation/img/image-br01.png b/tutorials/ai-core-genaihub-evaluation/img/image-br01.png new file mode 100644 index 000000000..5424ea51d Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image-br01.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image-br02.png b/tutorials/ai-core-genaihub-evaluation/img/image-br02.png new file mode 100644 index 000000000..4ed9d9ab0 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image-br02.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image-br03.png b/tutorials/ai-core-genaihub-evaluation/img/image-br03.png new file mode 100644 index 000000000..48b1474ce Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image-br03.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image-br04.png b/tutorials/ai-core-genaihub-evaluation/img/image-br04.png new file mode 100644 index 000000000..9f8a175e4 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image-br04.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image-br05.png b/tutorials/ai-core-genaihub-evaluation/img/image-br05.png new file mode 100644 index 000000000..69a105ef0 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image-br05.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image-br06.png b/tutorials/ai-core-genaihub-evaluation/img/image-br06.png new file mode 100644 index 000000000..81128b34b Binary files /dev/null and 
b/tutorials/ai-core-genaihub-evaluation/img/image-br06.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_007.png b/tutorials/ai-core-genaihub-evaluation/img/image_007.png new file mode 100644 index 000000000..0cdc4cf4a Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_007.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_008.png b/tutorials/ai-core-genaihub-evaluation/img/image_008.png new file mode 100644 index 000000000..2f12f021a Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_008.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_009.png b/tutorials/ai-core-genaihub-evaluation/img/image_009.png new file mode 100644 index 000000000..1c979c6b0 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_009.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_1.png b/tutorials/ai-core-genaihub-evaluation/img/image_1.png new file mode 100644 index 000000000..6db3eb05c Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_1.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_10.png b/tutorials/ai-core-genaihub-evaluation/img/image_10.png new file mode 100644 index 000000000..275de8254 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_10.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_19.png b/tutorials/ai-core-genaihub-evaluation/img/image_19.png new file mode 100644 index 000000000..3e302e4c7 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_19.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_20.png b/tutorials/ai-core-genaihub-evaluation/img/image_20.png new file mode 100644 index 000000000..ebf5c705e Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_20.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_21.png 
b/tutorials/ai-core-genaihub-evaluation/img/image_21.png new file mode 100644 index 000000000..dd9f9f22b Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_21.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_22.png b/tutorials/ai-core-genaihub-evaluation/img/image_22.png new file mode 100644 index 000000000..abcae67d6 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_22.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_23.png b/tutorials/ai-core-genaihub-evaluation/img/image_23.png new file mode 100644 index 000000000..97b0bc60f Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_23.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_24.png b/tutorials/ai-core-genaihub-evaluation/img/image_24.png new file mode 100644 index 000000000..5471c2e38 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_24.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_25.png b/tutorials/ai-core-genaihub-evaluation/img/image_25.png new file mode 100644 index 000000000..8b9af206b Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_25.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_26.png b/tutorials/ai-core-genaihub-evaluation/img/image_26.png new file mode 100644 index 000000000..d2166f1b2 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_26.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_27.png b/tutorials/ai-core-genaihub-evaluation/img/image_27.png new file mode 100644 index 000000000..c3472efdb Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_27.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_29.png b/tutorials/ai-core-genaihub-evaluation/img/image_29.png new file mode 100644 index 000000000..09c845eea Binary files /dev/null and 
b/tutorials/ai-core-genaihub-evaluation/img/image_29.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_31.png b/tutorials/ai-core-genaihub-evaluation/img/image_31.png new file mode 100644 index 000000000..7a1a959fb Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_31.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_32.png b/tutorials/ai-core-genaihub-evaluation/img/image_32.png new file mode 100644 index 000000000..fe827f346 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_32.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_33.png b/tutorials/ai-core-genaihub-evaluation/img/image_33.png new file mode 100644 index 000000000..1ee321aa5 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_33.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_34.png b/tutorials/ai-core-genaihub-evaluation/img/image_34.png new file mode 100644 index 000000000..47498a7b0 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_34.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_40.png b/tutorials/ai-core-genaihub-evaluation/img/image_40.png new file mode 100644 index 000000000..87af4a4e8 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_40.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_41.png b/tutorials/ai-core-genaihub-evaluation/img/image_41.png new file mode 100644 index 000000000..28e12a546 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_41.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_43.png b/tutorials/ai-core-genaihub-evaluation/img/image_43.png new file mode 100644 index 000000000..d594ffa7c Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_43.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_44.png 
b/tutorials/ai-core-genaihub-evaluation/img/image_44.png new file mode 100644 index 000000000..8b352c79e Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_44.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_45.png b/tutorials/ai-core-genaihub-evaluation/img/image_45.png new file mode 100644 index 000000000..7cf1a3f63 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_45.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_46.png b/tutorials/ai-core-genaihub-evaluation/img/image_46.png new file mode 100644 index 000000000..eab460b4f Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_46.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_47.png b/tutorials/ai-core-genaihub-evaluation/img/image_47.png new file mode 100644 index 000000000..fc729b5ea Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_47.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_48.png b/tutorials/ai-core-genaihub-evaluation/img/image_48.png new file mode 100644 index 000000000..a7d8b132f Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_48.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_49.png b/tutorials/ai-core-genaihub-evaluation/img/image_49.png new file mode 100644 index 000000000..ec44bfad3 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_49.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_5.png b/tutorials/ai-core-genaihub-evaluation/img/image_5.png new file mode 100644 index 000000000..e30beaf9b Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_5.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_50.png b/tutorials/ai-core-genaihub-evaluation/img/image_50.png new file mode 100644 index 000000000..74fea1ca6 Binary files /dev/null and 
b/tutorials/ai-core-genaihub-evaluation/img/image_50.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_6.png b/tutorials/ai-core-genaihub-evaluation/img/image_6.png new file mode 100644 index 000000000..4a96b45b9 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_6.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_ail_or1.png b/tutorials/ai-core-genaihub-evaluation/img/image_ail_or1.png new file mode 100644 index 000000000..060af6b82 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_ail_or1.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_ail_or2.png b/tutorials/ai-core-genaihub-evaluation/img/image_ail_or2.png new file mode 100644 index 000000000..7ceaf7244 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_ail_or2.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_ail_or3.png b/tutorials/ai-core-genaihub-evaluation/img/image_ail_or3.png new file mode 100644 index 000000000..0b60b1541 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_ail_or3.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_br_dt.png b/tutorials/ai-core-genaihub-evaluation/img/image_br_dt.png new file mode 100644 index 000000000..841683c51 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_br_dt.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_br_or1.png b/tutorials/ai-core-genaihub-evaluation/img/image_br_or1.png new file mode 100644 index 000000000..8af37314e Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_br_or1.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_br_pr.png b/tutorials/ai-core-genaihub-evaluation/img/image_br_pr.png new file mode 100644 index 000000000..22d143968 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_br_pr.png differ diff --git 
a/tutorials/ai-core-genaihub-evaluation/img/image_py03.png b/tutorials/ai-core-genaihub-evaluation/img/image_py03.png new file mode 100644 index 000000000..cace1aedb Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_py03.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_py_con.png b/tutorials/ai-core-genaihub-evaluation/img/image_py_con.png new file mode 100644 index 000000000..12bf2650b Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_py_con.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_py_or1.png b/tutorials/ai-core-genaihub-evaluation/img/image_py_or1.png new file mode 100644 index 000000000..0469ab08c Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_py_or1.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/image_py_rk.png b/tutorials/ai-core-genaihub-evaluation/img/image_py_rk.png new file mode 100644 index 000000000..d7e2b38c4 Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/img/image_py_rk.png differ diff --git a/tutorials/ai-core-genaihub-evaluation/img/requirements.txt b/tutorials/ai-core-genaihub-evaluation/img/requirements.txt new file mode 100644 index 000000000..c63e2f289 --- /dev/null +++ b/tutorials/ai-core-genaihub-evaluation/img/requirements.txt @@ -0,0 +1,7 @@ +generative-ai-hub-sdk==4.4.3 +python-dotenv==1.0.1 +boto3==1.37.4 +pandas==2.2.3 +json2html==1.3.0 +numpy==1.26.4 +ipywidgets==8.1.0 diff --git a/tutorials/ai-core-genaihub-evaluation/requirements.txt b/tutorials/ai-core-genaihub-evaluation/requirements.txt new file mode 100644 index 000000000..547492540 --- /dev/null +++ b/tutorials/ai-core-genaihub-evaluation/requirements.txt @@ -0,0 +1,7 @@ +sap-ai-sdk-gen +python-dotenv==1.0.1 +boto3==1.37.4 +pandas==2.2.3 +json2html==1.3.0 +numpy==1.26.4 +ipywidgets==8.1.0 diff --git a/tutorials/ai-core-genaihub-evaluation/results-new/results.db 
b/tutorials/ai-core-genaihub-evaluation/results-new/results.db new file mode 100644 index 000000000..24d338d4c Binary files /dev/null and b/tutorials/ai-core-genaihub-evaluation/results-new/results.db differ diff --git a/tutorials/ai-core-genaihub-evaluation/sample.env b/tutorials/ai-core-genaihub-evaluation/sample.env new file mode 100644 index 000000000..0cf95eaef --- /dev/null +++ b/tutorials/ai-core-genaihub-evaluation/sample.env @@ -0,0 +1,12 @@ +# AICORE CREDENTIALS +AICORE_CLIENT_ID= +AICORE_CLIENT_SECRET=AICORE CLIENT SECRET> +AICORE_AUTH_URL= +AICORE_BASE_URL= +AICORE_RESOURCE_GROUP =default + +# AWS CREDENTIALS +AWS_ACCESS_KEY= +AWS_BUCKET_ID=> +AWS_REGION= +AWS_SECRET_ACCESS_KEY= \ No newline at end of file diff --git a/tutorials/ai-core-genaihub-prompt-optimization/ai-core-genaihub-prompt-optimization.md b/tutorials/ai-core-genaihub-prompt-optimization/ai-core-genaihub-prompt-optimization.md index c93091929..446b664e5 100644 --- a/tutorials/ai-core-genaihub-prompt-optimization/ai-core-genaihub-prompt-optimization.md +++ b/tutorials/ai-core-genaihub-prompt-optimization/ai-core-genaihub-prompt-optimization.md @@ -9,20 +9,16 @@ author_profile: https://github.com/I321506 --- # Prompt optimization - This tutorial demonstrates how to use Prompt Optimization in SAP AI Core to automatically refine prompt templates using labeled datasets and evaluation metrics.The process optimizes a prompt for a specific model, stores metrics in the ML Tracking Service, and saves the optimized prompt and results back to the Prompt Registry and Object Store. + This tutorial demonstrates how to use Prompt Optimization in SAP AI Core to automatically refine prompt templates using labeled datasets and evaluation metrics. +The process optimizes a prompt for a specific model, stores metrics in the ML Tracking Service, and saves the optimized prompt and results back to the Prompt Registry and Object Store. 
## You will learn - - How to prepare datasets and object stores for prompt optimization. - - How to create and register prompt templates in the Prompt Registry. - - How to configure and run prompt optimization via AI Launchpad, Bruno, and the Python SDK. - - How to monitor executions, review metrics, and save optimized prompts for reuse. ## Prerequisites - 1. **BTP Account** Set up your SAP Business Technology Platform (BTP) account. [Create a BTP Account](https://developers.sap.com/group.btp-setup.html) @@ -38,45 +34,34 @@ author_profile: https://github.com/I321506 [AI Core Setup Tutorial](https://developers.sap.com/tutorials/ai-core-setup.html) 6. An Extended SAP AI Core service plan is required, as the Generative AI Hub is not available in the Free or Standard tiers. For more details, refer to [SAP AI Core Service Plans](https://help.sap.com/docs/sap-ai-core/sap-ai-core-service-guide/service-plans?version=CLOUD) +7. You've prepared a prompt template and your template is available in the prompt registry. For more information, see [Save a Template](https://help.sap.com/docs/AI_LAUNCHPAD/3f71b1e9d5124e26ace1aa1edb11e450/49d4248485644184ab3ca2ddf36119a6.html?locale=en-US&state=DRAFT&version=DEV) -## Pre-Read - +### Pre-Read Before starting this tutorial, ensure that you: - - Understand the basics of Generative AI workflows in SAP AI Core. - -- Are familiar with creating and managing prompt templates, artifacts, and object stores - +- Are familiar with creating and managing prompt templates, artifacts, and object stores - Have the required roles such as genai_manager or custom_evaluation. - - Have completed the Quick Start tutorial or equivalent setup for SAP AI Core and AI Launchpad access. -## Architecture Overview +### Architecture Overview - Prompt Optimization in SAP AI Core connects the Prompt Registry, Object Store, and ML Tracking Service to form an end-to-end optimization workflow. 
- - The dataset (for example, Test-Data.json) is stored in the Object Store and registered as an artifact. - - During execution, the system uses the selected prompt template, metric, and model to evaluate multiple prompt variants. - - Metrics are tracked in the ML Tracking Service, and both the optimized prompt and results are saved back to the registry and object store. - - This process runs as an execution and is model-specific, ensuring the optimized prompt aligns with the target model’s behavior. + ![img](img/image_arch.png) -## Notebook Reference +### Notebook Reference For hands-on execution and end-to-end reference, use the accompanying [Prompt Optimization Notebook](https://github.com/SAP-samples/aicore-genai-samples/blob/main/genai-sample-apps/prompt-optimizer/prompt-optimizer.ipynb). It includes complete Python code examples that align with each step of this tutorial — from dataset preparation and artifact registration to configuration creation, execution, and result retrieval. - 💡 Even though this tutorial provides stepwise code snippets for clarity, - the notebook contains all required imports, object initializations, and helper functions to run the flow seamlessly in one place. +💡 Even though this tutorial provides stepwise code snippets for clarity, the notebook contains all required imports, object initializations, and helper functions to run the flow seamlessly in one place. **To use the notebook:** - - Download and open [notebook](https://github.com/SAP-samples/aicore-genai-samples/blob/main/genai-sample-apps/prompt-optimizer/prompt-optimizer.ipynb) in your preferred environment (e.g., VS Code, JupyterLab). - - Configure your environment variables such as AICORE_BASE_URL, AICORE_AUTH_TOKEN, and object store credentials . - - Execute each cell in order to reproduce the complete prompt optimization workflow demonstrated in this tutorial. 
### Environment Variables Setup @@ -91,7 +76,7 @@ For hands-on execution and end-to-end reference, use the accompanying [Prompt Op - When prompted, enter your AI Core credentials (such as Client ID, Client Secret, and Base URL). - Note: If you're unsure about where to find these credentials, refer to this [guide](https://developers.sap.com/tutorials/ai-core-generative-ai.html#1c4f36d7-f345-4822-be00-c15f133ff7d8). -- Once the workspace is successfully created, select your desired Resource Group to begin the optimization process. +- Once the workspace is successfully created, select your desired Resource Group to begin the evaluation process. Refer to the screenshot below for guidance: ![img](img/image_34.png) @@ -110,7 +95,16 @@ AICORE_CLIENT_ID= AICORE_CLIENT_SECRET= AICORE_AUTH_URL= AICORE_BASE_URL= -AICORE_RESOURCE_GROUP= +AICORE_RESOURCE_GROUP= + +# AWS CREDENTIALS +AWS_ACCESS_KEY= +AWS_BUCKET_ID= +AWS_REGION= +AWS_SECRET_ACCESS_KEY= + +# ORCHESTRATION DEPLOYMENT URL +DEPLOYMENT_URL= ``` **Note:** Replace placeholders (e.g., CLIENT_ID, CLIENT_SECRET, etc) with your actual environment credentials. 
@@ -118,6 +112,40 @@ AICORE_RESOURCE_GROUP= Refer to the below screenshot for clarity: ![img](img/image_1.png) +#### Connect to AI Core Instance + +Once the environment variables are set and dependencies are installed, run the following code to connect to your instance: + +```PYTHON +# Loading the credentials from the env file +from gen_ai_hub.proxy.gen_ai_hub_proxy import GenAIHubProxyClient +from dotenv import load_dotenv +import os + +load_dotenv(override=True) + +# Fetching environment variables +AICORE_BASE_URL = os.getenv("AICORE_BASE_URL") +AICORE_RESOURCE_GROUP = os.getenv("AICORE_RESOURCE_GROUP") +AICORE_AUTH_URL = os.getenv("AICORE_AUTH_URL") +AICORE_CLIENT_ID = os.getenv("AICORE_CLIENT_ID") +AICORE_CLIENT_SECRET = os.getenv("AICORE_CLIENT_SECRET") + +AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY") +AWS_BUCKET_ID = os.getenv("AWS_BUCKET_ID") +AWS_REGION = os.getenv("AWS_REGION") +AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") + +# Initializing the GenAIHubProxyClient +client = GenAIHubProxyClient( + base_url=AICORE_BASE_URL, + auth_url=AICORE_AUTH_URL, + client_id=AICORE_CLIENT_ID, + client_secret=AICORE_CLIENT_SECRET, + resource_group=AICORE_RESOURCE_GROUP +) +``` + [OPTION END] [OPTION BEGIN [Bruno]] @@ -152,7 +180,7 @@ In the **Secret** field, use the following structure to provide your AWS credent ``` [OPTION END] -[OPTION BEGIN [Python]] +[OPTION BEGIN [Python SDK]] If you’re running this tutorial in a Python environment and need to create a new S3-based object store, you can register it manually: @@ -248,9 +276,9 @@ Each record should contain a sample input message and its corresponding expected Each record must include: -- input – the user message or text prompt + - input – the user message or text prompt -- answer – the expected model response (in valid JSON format) + - answer – the expected model response (in valid JSON format) Example record from facility-train.json: @@ -259,18 +287,18 @@ Example record from facility-train.json: { 
"fields": { "input": "Subject: Urgent Assistance Required for Specialized Cleaning Services\n\nDear ProCare - Facility Solutions Support Team. Could you please arrange for a specialized cleaning team to - visit our home at the earliest convenience? We would greatly appreciate it if this could be - prioritized since we want to host a large party this week.\n\nThank you for your prompt - attention to this matter. We look forward to your swift response and assistance.\n\nBest - regards,\n[Sender]" + Facility Solutions Support Team. Could you please arrange for a specialized cleaning team to + visit our home at the earliest convenience? We would greatly appreciate it if this could be + prioritized since we want to host a large party this week.\n\nThank you for your prompt + attention to this matter. We look forward to your swift response and assistance.\n\nBest + regards,\n[Sender]" }, "answer": "{\"categories\": {\"routine_maintenance_requests\": false, - \"customer_feedback_and_complaints\": false, \"training_and_support_requests\": false, - \"quality_and_safety_concerns\": false, \"sustainability_and_environmental_practices\": false, - \"cleaning_services_scheduling\": false, \"specialized_cleaning_services\": true, - \"emergency_repair_services\": false, \"facility_management_issues\": false, - \"general_inquiries\": false}, \"sentiment\": \"neutral\", \"urgency\": \"high\"}" + \"customer_feedback_and_complaints\": false, \"training_and_support_requests\": false, + \"quality_and_safety_concerns\": false, \"sustainability_and_environmental_practices\": false, + \"cleaning_services_scheduling\": false, \"specialized_cleaning_services\": true, + \"emergency_repair_services\": false, \"facility_management_issues\": false, + \"general_inquiries\": false}, \"sentiment\": \"neutral\", \"urgency\": \"high\"}" }, {...} ] @@ -340,6 +368,16 @@ A wizard appears to guide you through the process of uploading an artifact for o - Specify the relative subpath for your file in the 
object store. +8. (Optional) Choose Add Labels to include key-value tags that describe your artifact. + + - Use the ➕ icon to add more labels or the ✖ icon to delete labels. + + **Example:** + + - Key: prompt-optimization + + - Value: true + 9. Review all information and choose Add to complete the artifact registration. ![img](img/image_ail01.png) @@ -454,7 +492,54 @@ After registration, the artifact will be visible in AI Launchpad → Workspaces [OPTION END] -[OPTION BEGIN [Bruno]] +[OPTION BEGIN [Bruno]] + +Before registering a dataset artifact in Bruno, you must upload your json file to the SAP AI Core object store using the Dataset API. +Bruno cannot upload files directly to S3; therefore, this step is required. + +**Prerequisites** + + - An object store secret must already exist in your resource group.Typically, this is the default secret named **default**. + + - The Dataset API currently supports: + + - S3 object stores only + + - json file uploads + +**Upload Your Dataset** + +Use the Dataset API – Upload File request in Bruno: + +```bash +PUT:{{ai_api_url}}/v2/lm/dataset/files/{{secretName}}/{{datasetPath}} +``` + +**Headers** + +```json +Authorization: Bearer {{token}} +AI-Resource-Group: {{resourceGroup}} +Content-Type: text/csv +``` + +**Body** + +Upload your .csv file directly as binary in Bruno’s Body + +Example Path Values: + + - secretName: default + + - datasetPath: dataset/facility-train.json + +![img](img/image_br_dt.png) + +**Note:** + +Save the ai://… URL — you will use this when creating the dataset artifact. + +**Register the Dataset Artifact** - Click on **Register artifact** under lm -> artifacts in bruno collection to register the artifact @@ -493,7 +578,6 @@ The template is registered in the Prompt Registry and later referenced by the op In the Message Blocks section: - Add a System and user role message: - ```json system: |- You are a helpful assistant. 
@@ -520,6 +604,7 @@ user: |- "facility_management_issues": } } + Your response must: - Contain only this JSON structure (no extra text). - Be valid JSON (parsable without errors). @@ -559,242 +644,46 @@ Go to Generative AI Hub → Prompt Management → Templates and confirm: In your notebook or Python environment, you can define and register the same template programmatically using the SAP Generative AI SDK. ```python -from logging import PlaceHolder -from pydantic import BaseModel -from typing import List -import re -import requests -import json - -class PromptTemplate(BaseModel): - role: str - content: str - - -class PromptTemplateSpec(BaseModel): - template: List[PromptTemplate] - - - @property - def placeholders(self): - placeholders = set() - pattern = re.compile(r'\{\{\s*\?\s*(\w+)\s*\}\}') - for message in self.template: - placeholders.update(pattern.findall(message.content)) - return placeholders - - @classmethod - def from_optimizer_result(cls, input_): - placeholders = input_["user_message_template_fields"] - def replace(msg): - for key in placeholders: - msg = msg.replace("{"+key+"}", "{{?"+ key + "}}") - return msg - - return cls( - template=[ - { - "role": "system", - "content": replace(input_["system_prompt"]), - },{ - "role": "user", - "content": replace(input_["user_message_template"]), - } - ] +from gen_ai_hub.prompt_registry.client import PromptTemplateClient +from gen_ai_hub.prompt_registry.models.prompt_template import PromptTemplateSpec, PromptTemplate + +# Initialize Prompt Registry Client +prompt_registry_client = PromptTemplateClient(proxy_client=client) + +prompt_template_spec = PromptTemplateSpec( + template=[ + PromptTemplate( + role="system", + content=( + "You are a helpful assistant." 
+ ) + ), + PromptTemplate( + role="user", + content=( + """Giving the following message: + --- + {{?input}} + --- + Extract and return a json with the follwoing keys and values: + - "urgency" as one of `high`, `medium`, `low` + - "sentiment" as one of `negative`, `neutral`, `positive` + - "categories" Create a dictionary with categories as keys and boolean values (True/False), where the value indicates whether the category is one of the best matching support category tags from: `emergency_repair_services`, `routine_maintenance_requests`, `quality_and_safety_concerns`, `specialized_cleaning_services`, `general_inquiries`, `sustainability_and_environmental_practices`, `training_and_support_requests`, `cleaning_services_scheduling`, `customer_feedback_and_complaints`, `facility_management_issues` + Your complete message should be a valid json string that can be read directly and only contain the keys mentioned in the list above. Never enclose it in ```json...```, no newlines, no unnessacary whitespaces.""" + ) ) - - def escape_curly_brackets(self) -> "PromptTemplateSpec": - # 1. 
Hide each {{?key}} placeholder with a unique token - placeholder_pattern = re.compile(r'\{\{\s*\?\s*(\w+)\s*\}\}') - mapping = {} - counter = 1 - - def _hide(match): - nonlocal counter - token = f"__PLACEHOLDER_{counter}__" - mapping[token] = match.group(0) - counter += 1 - return token - - new_templates = [] - for msg in self.template: - # a) hide custom placeholders - hidden = placeholder_pattern.sub(_hide, msg.content) - # b) escape all remaining braces - escaped = hidden.replace('{', '{{').replace('}', '}}') - # c) restore the original placeholders - print(mapping) - for token, original in mapping.items(): - escaped = escaped.replace(token, original) - - new_templates.append(PromptTemplate(role=msg.role, content=escaped)) - - # return a fresh copy - return PromptTemplateSpec(template=new_templates) - - - -def fetch_prompt_template(prompt_template: str) -> PromptTemplateSpec: - headers = { - **client.request_header, - "Content-Type": "application/json", - } - url = f"{client.ai_core_client.base_url}/lm/promptTemplates" - scenario, sep, name = prompt_template.partition("/") - if sep: - name, sep, version = name.partition(":") - if sep: - body = {"name": name, - "version": version, - "scenario": scenario, - "includeSpec": True - } - response = requests.get(url, headers=headers, params=body) - response.raise_for_status() - response = response.json() - if response["count"] > 0: - response = response["resources"][0] - else: - raise ValueError(f"Prompt template {name} not found.") - else: - url += f"/{prompt_template}" - response = requests.get(url, headers=headers) - response.raise_for_status() - response = response.json() - return PromptTemplateSpec.model_validate(response["spec"]) - -def load_prompt_template(prompt: str | pathlib.Path | list | dict | PromptTemplateSpec) -> PromptTemplateSpec: - if isinstance(prompt, PromptTemplateSpec): - return prompt - if isinstance(prompt, (str, pathlib.Path)) and pathlib.Path(prompt).exists(): - with open(prompt, "r") as f: - 
prompt = yaml.safe_load(f) - elif isinstance(prompt, str): - return fetch_prompt_template(prompt) - if isinstance(prompt, dict): - # expect dict with keys "system" [optional] and "user" - messages = [] - if "system" in prompt: - messages.append({"role": "system", "content": prompt["system"]}) - messages.append({"role": "user", "content": prompt["user"]}) - return PromptTemplateSpec(template=messages) - elif isinstance(prompt, list): - # expect list of dicts with keys "role" and "content" - return PromptTemplateSpec(template=messages) - else: - raise ValueError("Prompt must be a string, Path, list or dict") - - -def push_prompt_template(prompt_template: PromptTemplateSpec, - prompt_template_name_registry: str, - prompt_template_version: str, - scenario: str, - update=False): - headers = { - **client.request_header, - "Content-Type": "application/json", - } - url = f"{client.ai_core_client.base_url}/lm/promptTemplates" - body = {"name": prompt_template_name_registry, - "version": prompt_template_version, - "scenario": scenario} - res = requests.get(url, headers=headers, params=body).json() - if res["count"] > 0 and not update: - print(f"Prompt template {prompt_template_name_registry} already exists. Use update=True to update.") - return res["resources"][0] - # Prepare body - - body["spec"] = prompt_template.model_dump() - # Prepare headers - response = requests.post(url, headers=headers, json=body) - # Handle response - if response.status_code == 201: - response = response.json() - elif response.status_code in (400, 409, 413): - # Return error details - raise requests.HTTPError(f"Upload failed ({response.status_code}): {response.text}") - else: - response.raise_for_status() - return response.json() - - -import re - -def convert_py_notation(template): - pattern = re.compile(r'\{\{\s*\?\s*(\w+)\s*\}\}') - return pattern.sub(lambda match: "{" + match.group(1) + "}", template) - - -def validate_prompt(prompt: PromptTemplateSpec): - values = {k: "???" 
for k in prompt.placeholders} - - for message in prompt.template: - if message.role == "user": - try: - convert_py_notation(message.content).format(**values) - except KeyError as err: - msg = ["Unexpected key error when running test formatting."] - msg += ["This is most likeyly due to unescaped curly brackets."] - msg += ["You can try fixing this by running `prompt = prompt.escape_curly_brackets()` and use the new prompt template."] - raise ValueError("\n".join(msg)) from err - return True - - - - -from rich.console import Console -from rich.highlighter import RegexHighlighter -from rich.theme import Theme -from rich.panel import Panel -from rich import print - -class TemplateHighlighter(RegexHighlighter): - """Apply style to anything that looks like an email.""" - - base_style = "template." - highlights = [r"(?P\{\{\s*\?[^\{\}\s]+\s*\}\})"] - -highlighter = TemplateHighlighter() -theme = Theme({"template.placeholder": "bold magenta", "example.email": "bold magenta"}) -console = Console(highlighter=highlighter, theme=theme) - - -def print_prompt_template(prompt_template: PromptTemplateSpec | str | pathlib.Path, addition: str | None = None): - - prompt_template = load_prompt_template(prompt_template) - addition = f' - {addition}' if addition else '' - - for message in prompt_template.template: - if message.role == "system": - console.print(Panel(highlighter(message.content), title="System Message" + addition, border_style="red")) - elif message.role == "user": - console.print(Panel(highlighter(message.content), title="User Message" + addition, border_style="green")) - else: - console.print(Panel(highlighter(message.content), title="Assistant Message" + addition)) -``` -```Python -base_prompt_template = "./facility_prompt.yaml" # local path to the prompt template or Prompt Repository identifier - - -prompt = load_prompt_template(base_prompt_template) # .escape_curly_brackets() if validation fails. 
-print_prompt_template(prompt) -print(f"Prompt template loaded successfully. Placeholders found are: {prompt.placeholders}") -assert validate_prompt(prompt) -``` -```Python -base_template = load_prompt_template(base_prompt_template) -prompt_template_name_registry, _, prompt_template_version = base_prompt_template_registry.partition(":") -prompt = push_prompt_template(prompt_template=base_template, - prompt_template_name_registry=prompt_template_name_registry, - prompt_template_version=prompt_template_version, - scenario=scenario, - update=False + ] ) -print(f"Prompt present in registry under id {prompt['id']}") +# Create prompt template in registry +template = prompt_registry_client.create_prompt_template( + scenario="genai-optimizations", + name="facility-json-template", + version="1.0.0", + prompt_template_spec=prompt_template_spec +) -print('\n\n=== Base Prompt ===') -print_prompt_template(prompt["id"]) +print(f"✅ Created Prompt Template with ID: {template.id}") ``` **Notes** @@ -952,23 +841,6 @@ def create_config(metric: str, return response.id -# Config parameters -scenario = "genai-optimizations" - -base_prompt_template_registry = "evaluate-base:0.0.1" # name:version for the template in the registry - -dataset_secret="default" # secret name in the object store you want to use to store the dataset -dataset_remote_path="datasets/facility-train.json" # remote path in the object store to store the dataset - -reference_model = "gpt-4o:2024-08-06" -# Dictionary of models to optimize with their corresponding prompt template names under which the optimized prompt should be stored in the registry -targets = { - "gemini-2.5-pro:001": "evaluate-base-gemini-2_5-pro:0.0.1" -} - -# Metric to use for optimization -metric = "JSON_Match" - # Create the configuration configuration_id = create_config( metric=metric, @@ -1041,16 +913,9 @@ When the execution completes, the optimized prompt and results will be stored au [OPTION BEGIN [SAP AI Launchpad]] -- Navigate to the 
**Generative AI Hub** in the SAP AI Core Launchpad. - -- In the left-side menu, click on **Configurations/create Execution**. - -- Click the **Create Execution** button to begin setting up a new prompt execution. - -![img](img/image_ail09.png) +Once you complete Review your inputs and click Create in the Register an Optimization Configuration step, the Optimization job starts automatically. -You can monitor the progress and results directly from the Launchpad UI under **Optimizations → Executions** -Once completed, you can inspect logs, review metrics, and view the optimized prompt details. +After the job reaches Completed status, you can inspect logs, review evaluation metrics, and view the optimized prompt details. [OPTION END] diff --git a/tutorials/ai-core-genaihub-prompt-optimization/facility_prompt copy.yaml b/tutorials/ai-core-genaihub-prompt-optimization/facility_prompt copy.yaml deleted file mode 100644 index e10ca3c8b..000000000 --- a/tutorials/ai-core-genaihub-prompt-optimization/facility_prompt copy.yaml +++ /dev/null @@ -1,13 +0,0 @@ -system: |- - You are a helpful assistant - -user: |- - Giving the following message: - --- - {{?input}} - --- - Extract and return a json with the follwoing keys and values: - - "urgency" as one of `high`, `medium`, `low` - - "sentiment" as one of `negative`, `neutral`, `positive` - - "categories" Create a dictionary with categories as keys and boolean values (True/False), where the value indicates whether the category is one of the best matching support category tags from: `emergency_repair_services`, `routine_maintenance_requests`, `quality_and_safety_concerns`, `specialized_cleaning_services`, `general_inquiries`, `sustainability_and_environmental_practices`, `training_and_support_requests`, `cleaning_services_scheduling`, `customer_feedback_and_complaints`, `facility_management_issues` - Your complete message should be a valid json string that can be read directly and only contain the keys mentioned in the list above. 
Never enclose it in ```json...```, no newlines, no unnessacary whitespaces. diff --git a/tutorials/ai-core-genaihub-prompt-optimization/img/image_br_dt.png b/tutorials/ai-core-genaihub-prompt-optimization/img/image_br_dt.png new file mode 100644 index 000000000..fc0a10d6b Binary files /dev/null and b/tutorials/ai-core-genaihub-prompt-optimization/img/image_br_dt.png differ diff --git a/tutorials/ai-core-genaihub-prompt-optimization/prompt-optimizer.ipynb b/tutorials/ai-core-genaihub-prompt-optimization/onboarding-tutorial.ipynb similarity index 100% rename from tutorials/ai-core-genaihub-prompt-optimization/prompt-optimizer.ipynb rename to tutorials/ai-core-genaihub-prompt-optimization/onboarding-tutorial.ipynb