refactor: split up generate.ts file for better code organization

devversion · devversion · commit 1ad1cbaa508c · 2025-10-14T15:34:27.000+02:00
The `generate.ts` file is becoming too large, and we should split up the
file for better readability and code navigation.
diff --git a/runner/orchestration/generate-eval-task.ts b/runner/orchestration/generate-eval-task.ts
@@ -0,0 +1,213 @@
+import PQueue from 'p-queue';
+import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
+import {Environment} from '../configuration/environment.js';
+import {
+  AssessmentConfig,
+  AssessmentResult,
+  AttemptDetails,
+  MultiStepPromptDefinition,
+  PromptDefinition,
+} from '../shared-interfaces.js';
+import {EvalID} from './executors/executor.js';
+import {ProgressLogger} from '../progress/progress-logger.js';
+import {resolveContextFiles, setupProjectStructure, writeResponseFiles} from './file-system.js';
+import {generateInitialFiles} from './generate-initial-files.js';
+import {generateUserJourneysForApp} from './user-journeys.js';
+import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';
+import {attemptBuildAndTest} from './build-serve-test-loop.js';
+import {rateGeneratedCode} from '../ratings/rate-code.js';
+import {DEFAULT_AUTORATER_MODEL_NAME} from '../configuration/constants.js';
+
+/**
+ * Runs a single evaluation task end-to-end: sets up the project directory, generates (or loads)
+ * the initial files for each prompt step, attempts to build/test and repair them, and rates
+ * the final output.
+ *
+ * For multi-step prompts every step is executed sequentially against the same project
+ * directory. Processing stops at the first step whose files cannot be written or that does
+ * not yield a build attempt; results of the steps completed before that are still returned.
+ *
+ * The project directory is cleaned up exactly once when the task finishes, including when
+ * one of the steps throws (the error is still propagated to the caller).
+ *
+ * @param config Configuration of the assessment (output directory, RAG endpoint, autorater model, etc.).
+ * @param evalID ID of the evaluation task.
+ * @param env Environment for this evaluation.
+ * @param ratingLlm LLM runner used for generating user journeys and for rating the generated code.
+ * @param rootPromptDef Definition of the root prompt being processed.
+ * @param abortSignal Abort signal for when the evaluation task should be aborted.
+ * @param workerConcurrencyQueue Concurrency queue for controlling parallelism of worker invocations (as they are more expensive than LLM calls).
+ * @param progress Logger used to report the progress of the task.
+ * @returns A Promise that resolves to the assessment results, one per successfully processed prompt step.
+ */
+export async function startEvaluationTask(
+  config: AssessmentConfig,
+  evalID: EvalID,
+  env: Environment,
+  ratingLlm: GenkitRunner,
+  rootPromptDef: PromptDefinition | MultiStepPromptDefinition,
+  abortSignal: AbortSignal,
+  workerConcurrencyQueue: PQueue,
+  progress: ProgressLogger,
+): Promise<AssessmentResult[]> {
+  // Set up the project structure once for the root project.
+  const {directory, cleanup} = await setupProjectStructure(
+    env,
+    rootPromptDef,
+    progress,
+    config.outputDirectory,
+  );
+
+  const results: AssessmentResult[] = [];
+  const defsToExecute = rootPromptDef.kind === 'single' ? [rootPromptDef] : rootPromptDef.steps;
+
+  try {
+    for (const promptDef of defsToExecute) {
+      const [fullPromptText, systemInstructions] = await Promise.all([
+        env.getPrompt(promptDef.systemPromptType, promptDef.prompt, config.ragEndpoint),
+        env.getPrompt(promptDef.systemPromptType, ''),
+      ]);
+
+      // Resolve the context files from the root. We need to do this after the project is set up
+      // and for each sub-prompt, because the project will be augmented on each iteration.
+      const contextFiles = await resolveContextFiles(promptDef.contextFilePatterns, directory);
+
+      // Generate the initial set of files through the LLM.
+      const initialResponse = await generateInitialFiles(
+        config,
+        evalID,
+        env,
+        promptDef,
+        {
+          directory,
+          systemInstructions,
+          combinedPrompt: fullPromptText,
+          executablePrompt: promptDef.prompt,
+        },
+        contextFiles,
+        abortSignal,
+        progress,
+      );
+
+      // Guard before touching any property of the response.
+      if (!initialResponse) {
+        progress.log(
+          promptDef,
+          'error',
+          'Failed to generate initial code using AI. Skipping this app.',
+        );
+        break;
+      }
+
+      const toolLogs = initialResponse.toolLogs ?? [];
+
+      try {
+        // Write the generated files to disk within the project directory.
+        // Note: This can fail when the LLM e.g. produced a wrong file name that is too large,
+        // and results in a file system error. Gracefully handle this so we can continue testing.
+        await writeResponseFiles(directory, initialResponse.files, env, rootPromptDef.name);
+
+        // If we're in a multi-step prompt, also write out to dedicated directories
+        // for each sub-prompt so that we can inspect the output along the way.
+        if (rootPromptDef.kind === 'multi-step') {
+          await writeResponseFiles(directory, initialResponse.files, env, promptDef.name);
+        }
+      } catch (e) {
+        let details = `Error: ${e}`;
+
+        if ((e as Partial<Error>).stack) {
+          // Keep the stack on its own line so it does not run into the message.
+          details += `\n${(e as Error).stack}`;
+        }
+
+        progress.log(
+          promptDef,
+          'error',
+          'Failed to generate initial code using AI. Skipping this app.',
+          details,
+        );
+        break;
+      }
+
+      // User journeys are always derived from the prompt of the first step.
+      const userJourneys = config.enableUserJourneyTesting
+        ? await generateUserJourneysForApp(
+            ratingLlm,
+            rootPromptDef.name,
+            defsToExecute[0].prompt,
+            initialResponse.files,
+            abortSignal,
+          )
+        : undefined;
+
+      // TODO: Only execute the serve command on the "final working attempt".
+      // TODO: Incorporate usage.
+      const userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined = userJourneys
+        ? {
+            userJourneys: userJourneys.result,
+            appPrompt: defsToExecute[0].prompt,
+          }
+        : undefined;
+
+      const attemptDetails: AttemptDetails[] = []; // Store details for assessment.json
+
+      // Try to build the files in the root prompt directory.
+      // This will also attempt to fix issues with the generated code.
+      const attempt = await attemptBuildAndTest(
+        config,
+        evalID,
+        env,
+        rootPromptDef,
+        directory,
+        contextFiles,
+        initialResponse,
+        attemptDetails,
+        abortSignal,
+        workerConcurrencyQueue,
+        progress,
+        userJourneyAgentTaskInput,
+      );
+
+      if (!attempt) {
+        break;
+      }
+
+      const score = await rateGeneratedCode(
+        ratingLlm,
+        env,
+        promptDef,
+        fullPromptText,
+        attempt.outputFiles,
+        attempt.buildResult,
+        attempt.serveTestingResult,
+        attempt.repairAttempts,
+        attempt.axeRepairAttempts,
+        abortSignal,
+        progress,
+        config.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME,
+        attempt.testResult ?? null,
+        attempt.testRepairAttempts,
+      );
+
+      results.push({
+        promptDef: {
+          // Note: we don't pass the prompt def along directly,
+          // because it can contain data that cannot be encoded.
+          name: promptDef.name,
+          prompt: promptDef.prompt,
+        },
+        outputFiles: attempt.outputFiles,
+        finalAttempt: attempt,
+        score,
+        repairAttempts: attempt.repairAttempts,
+        attemptDetails,
+        userJourneys: userJourneys,
+        axeRepairAttempts: attempt.axeRepairAttempts,
+        toolLogs,
+        testResult: attempt.testResult ?? null,
+        testRepairAttempts: attempt.testRepairAttempts,
+      } satisfies AssessmentResult);
+    }
+  } finally {
+    // Single cleanup point for every exit path: normal completion, early `break`,
+    // and errors thrown by any of the awaited steps above.
+    await cleanup();
+  }
+
+  return results;
+}
diff --git a/runner/orchestration/generate-initial-files.ts b/runner/orchestration/generate-initial-files.ts
@@ -0,0 +1,96 @@
+import {join} from 'node:path';
+import {LocalLlmGenerateFilesResponse} from '../codegen/llm-runner.js';
+import {Environment} from '../configuration/environment.js';
+import {ProgressLogger} from '../progress/progress-logger.js';
+import {
+  AssessmentConfig,
+  LlmContextFile,
+  LlmGenerateFilesRequest,
+  RootPromptDefinition,
+  Usage,
+} from '../shared-interfaces.js';
+import {EvalID} from './executors/executor.js';
+import {LLM_OUTPUT_DIR} from '../configuration/constants.js';
+import {globSync} from 'tinyglobby';
+import {UserFacingError} from '../utils/errors.js';
+import {readFile} from 'node:fs/promises';
+import {createLlmResponseTokenUsageMessage} from './codegen.js';
+
+/**
+ * Produces the initial set of files for a prompt, either by reading previously generated
+ * files from disk (local mode) or by requesting them from the environment's executor.
+ *
+ * @param options Assessment configuration; `localMode` and `model` are read here.
+ * @param evalID ID of the eval for which files are generated.
+ * @param env Environment that is currently being run.
+ * @param promptDef Definition of the prompt for which files are generated.
+ * @param codegenRequest Request that is forwarded to the executor when generating with an LLM.
+ * @param contextFiles Files that should be passed as context to the LLM.
+ * @param abortSignal Signal to fire when this process should be aborted.
+ * @param progress Logger used to report the code generation status.
+ * @returns The files together with the usage, reasoning and tool logs of the generation.
+ * @throws {UserFacingError} In local mode, when no pre-existing files can be found.
+ * @throws {Error} When the executor reports an unsuccessful generation.
+ */
+export async function generateInitialFiles(
+  options: AssessmentConfig,
+  evalID: EvalID,
+  env: Environment,
+  promptDef: RootPromptDefinition,
+  codegenRequest: LlmGenerateFilesRequest,
+  contextFiles: LlmContextFile[],
+  abortSignal: AbortSignal,
+  progress: ProgressLogger,
+): Promise<LocalLlmGenerateFilesResponse> {
+  if (options.localMode) {
+    return readPreExistingFiles(env, promptDef);
+  }
+
+  progress.log(promptDef, 'codegen', 'Generating code with AI');
+
+  const generation = await env.executor.generateInitialFiles(
+    evalID,
+    codegenRequest,
+    options.model,
+    contextFiles,
+    abortSignal,
+  );
+
+  // Failure path: report the errors to the progress logger first, then abort.
+  if (!generation.success) {
+    progress.log(
+      promptDef,
+      'error',
+      'Failed to generate code with AI',
+      generation.errors.join(', '),
+    );
+    throw new Error(`Initial file generation failed: ${generation.errors.join('\n')}`);
+  }
+
+  const usageMessage = createLlmResponseTokenUsageMessage(generation) ?? '';
+  progress.log(promptDef, 'codegen', 'Received AI code generation response', usageMessage);
+
+  return {
+    files: generation.outputFiles!,
+    usage: generation.usage,
+    reasoning: generation.reasoning,
+    toolLogs: generation.toolLogs,
+  };
+}
+
+/**
+ * Reads the files that already exist for the given prompt in the local LLM output directory.
+ * Token usage is reported as zero and no reasoning or tool logs are attached.
+ */
+async function readPreExistingFiles(
+  env: Environment,
+  promptDef: RootPromptDefinition,
+): Promise<LocalLlmGenerateFilesResponse> {
+  const sourceDirectory = join(LLM_OUTPUT_DIR, env.id, promptDef.name);
+  const relativePaths = globSync('**/*', {cwd: sourceDirectory});
+
+  if (relativePaths.length === 0) {
+    throw new UserFacingError(`Could not find pre-existing files in ${sourceDirectory}`);
+  }
+
+  const files = await Promise.all(
+    relativePaths.map(async filePath => ({
+      filePath,
+      code: await readFile(join(sourceDirectory, filePath), 'utf8'),
+    })),
+  );
+  const usage = {inputTokens: 0, outputTokens: 0} satisfies Usage;
+
+  // TODO: We could also try save/restore reasoning locally.
+  return {files, usage, reasoning: '', toolLogs: []};
+}
diff --git a/runner/orchestration/generate-summary.ts b/runner/orchestration/generate-summary.ts
@@ -0,0 +1,86 @@
+import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
+import {Environment} from '../configuration/environment.js';
+import {redX} from '../reporting/format.js';
+import {summarizeReportWithAI} from '../reporting/report-ai-summary.js';
+import {AssessmentResult, CompletionStats, RunSummary} from '../shared-interfaces.js';
+
+/**
+ * Builds the summary of a run: aggregated token usage, optional AI-generated report summary,
+ * and metadata about the model, environment, frameworks and runner.
+ *
+ * Token usage is summed over the rating step of every assessment, every generate/build
+ * attempt, and (unless skipped) the AI summary request itself.
+ *
+ * @param genkit Runner passed to the AI summary generation.
+ * @param abortSignal Signal passed to the AI summary generation.
+ * @param model Name of the model, copied into the summary.
+ * @param env Environment from which IDs, display names and executor info are read.
+ * @param assessments Assessment results whose token usage is aggregated.
+ * @param completionStats Completion stats, copied into the summary as-is.
+ * @param opts Options; `skipAiSummary` disables the AI summary generation.
+ * @returns The assembled run summary. A failing AI summary is logged and left undefined.
+ */
+export async function prepareSummary(
+  genkit: GenkitRunner,
+  abortSignal: AbortSignal,
+  model: string,
+  env: Environment,
+  assessments: AssessmentResult[],
+  completionStats: CompletionStats,
+  opts: {skipAiSummary?: boolean},
+): Promise<RunSummary> {
+  const usage = {inputTokens: 0, outputTokens: 0, totalTokens: 0};
+
+  for (const assessment of assessments) {
+    // Incorporate usage from running raters.
+    const raterUsage = assessment.score.tokenUsage;
+    if (raterUsage) {
+      usage.inputTokens += raterUsage.inputTokens;
+      usage.outputTokens += raterUsage.outputTokens;
+      usage.totalTokens += raterUsage.totalTokens ?? 0;
+    }
+
+    // Incorporate usage numbers from all generate + build attempts.
+    for (const {usage: attemptUsage} of assessment.attemptDetails) {
+      if (!attemptUsage) {
+        continue;
+      }
+      usage.inputTokens += attemptUsage.inputTokens ?? 0;
+      usage.outputTokens += attemptUsage.outputTokens ?? 0;
+      usage.totalTokens += attemptUsage.totalTokens ?? 0;
+    }
+  }
+
+  let aiSummary: string | undefined = undefined;
+  if (!opts.skipAiSummary) {
+    try {
+      const aiResult = await summarizeReportWithAI(genkit, abortSignal, assessments);
+      usage.inputTokens += aiResult.usage.inputTokens;
+      usage.outputTokens += aiResult.usage.outputTokens;
+      usage.totalTokens += aiResult.usage.totalTokens;
+      aiSummary = aiResult.responseHtml;
+    } catch (e) {
+      // Best effort: the report is still produced without an AI summary.
+      console.error(`${redX()} Failed to generate AI summary for report: ${e}`);
+      if ((e as Partial<Error>).stack) {
+        console.error((e as Error).stack);
+      }
+    }
+  }
+
+  // NOTE(review): the optional call yields `undefined` when the executor does not implement
+  // `getExecutorInfo`, in which case the property reads below would throw — confirm that
+  // every executor provides it.
+  const executorInfo = await env.executor.getExecutorInfo?.();
+  const {fullStackFramework, clientSideFramework} = env;
+
+  return {
+    model,
+    environmentId: env.id,
+    displayName: env.displayName,
+    framework: {
+      fullStackFramework: {
+        id: fullStackFramework.id,
+        displayName: fullStackFramework.displayName,
+      },
+      clientSideFramework: {
+        id: clientSideFramework.id,
+        displayName: clientSideFramework.displayName,
+      },
+    },
+    aiSummary,
+    completionStats: completionStats,
+    usage,
+    runner: {
+      id: executorInfo.id,
+      displayName: executorInfo.displayName,
+    },
+  } satisfies RunSummary;
+}
diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts