
Commit 1ad1cba

refactor: split up generate.ts file for better code organization
The `generate.ts` file is becoming too large, and we should split up the file for better readability and code navigation.
1 parent b135de0 · commit 1ad1cba

4 files changed: +408 -389 lines

Lines changed: 213 additions & 0 deletions
@@ -0,0 +1,213 @@
import PQueue from 'p-queue';
import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
import {Environment} from '../configuration/environment.js';
import {
  AssessmentConfig,
  AssessmentResult,
  AttemptDetails,
  MultiStepPromptDefinition,
  PromptDefinition,
} from '../shared-interfaces.js';
import {EvalID} from './executors/executor.js';
import {ProgressLogger} from '../progress/progress-logger.js';
import {resolveContextFiles, setupProjectStructure, writeResponseFiles} from './file-system.js';
import {generateInitialFiles} from './generate-initial-files.js';
import {generateUserJourneysForApp} from './user-journeys.js';
import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';
import {attemptBuildAndTest} from './build-serve-test-loop.js';
import {rateGeneratedCode} from '../ratings/rate-code.js';
import {DEFAULT_AUTORATER_MODEL_NAME} from '../configuration/constants.js';

/**
 * Creates and executes a task to generate or load code for a given prompt,
 * attempt to build it, repair it if necessary, and assess its quality.
 *
 * This function handles both online (AI-generated) and local (file-based) code retrieval.
 * It manages build attempts and AI-driven repair cycles.
 *
 * @param config Assessment configuration for this run (model, local mode, output directory, testing flags, etc.).
 * @param evalID ID of the evaluation task.
 * @param env Environment for this evaluation.
 * @param ratingLlm LLM runner used for rating the generated code and deriving user journeys.
 * @param rootPromptDef Definition of the root prompt being processed.
 * @param abortSignal Abort signal for when the evaluation task should be aborted.
 * @param workerConcurrencyQueue Concurrency queue for controlling parallelism of worker invocations (as they are more expensive than LLM calls).
 * @param progress Logger used to report progress of the evaluation.
 * @returns A Promise that resolves to an array of AssessmentResult objects containing all details of the task's execution.
 */
export async function startEvaluationTask(
  config: AssessmentConfig,
  evalID: EvalID,
  env: Environment,
  ratingLlm: GenkitRunner,
  rootPromptDef: PromptDefinition | MultiStepPromptDefinition,
  abortSignal: AbortSignal,
  workerConcurrencyQueue: PQueue,
  progress: ProgressLogger,
): Promise<AssessmentResult[]> {
  // Set up the project structure once for the root project.
  const {directory, cleanup} = await setupProjectStructure(
    env,
    rootPromptDef,
    progress,
    config.outputDirectory,
  );

  const results: AssessmentResult[] = [];
  const defsToExecute = rootPromptDef.kind === 'single' ? [rootPromptDef] : rootPromptDef.steps;

  for (const promptDef of defsToExecute) {
    const [fullPromptText, systemInstructions] = await Promise.all([
      env.getPrompt(promptDef.systemPromptType, promptDef.prompt, config.ragEndpoint),
      env.getPrompt(promptDef.systemPromptType, ''),
    ]);

    // Resolve the context files from the root. We need to do this after the project is set up
    // and for each sub-prompt, because the project will be augmented on each iteration.
    const contextFiles = await resolveContextFiles(promptDef.contextFilePatterns, directory);

    // Generate the initial set of files through the LLM.
    const initialResponse = await generateInitialFiles(
      config,
      evalID,
      env,
      promptDef,
      {
        directory,
        systemInstructions,
        combinedPrompt: fullPromptText,
        executablePrompt: promptDef.prompt,
      },
      contextFiles,
      abortSignal,
      progress,
    );

    const toolLogs = initialResponse.toolLogs ?? [];

    if (!initialResponse) {
      progress.log(
        promptDef,
        'error',
        'Failed to generate initial code using AI. Skipping this app.',
      );
      await cleanup();
      break;
    }

    try {
      // Write the generated files to disk.
      // Note: This can fail when the LLM e.g. produced a wrong file name that is too large,
      // and results in a file system error. Gracefully handle this so we can continue testing.
      // Write the generated files to disk within the project directory.
      await writeResponseFiles(directory, initialResponse.files, env, rootPromptDef.name);

      // If we're in a multi-step prompt, also write out to dedicated directories
      // for each sub-prompt so that we can inspect the output along the way.
      if (rootPromptDef.kind === 'multi-step') {
        await writeResponseFiles(directory, initialResponse.files, env, promptDef.name);
      }
    } catch (e) {
      let details = `Error: ${e}`;

      if ((e as Partial<Error>).stack) {
        details += (e as Error).stack;
      }

      progress.log(
        promptDef,
        'error',
        'Failed to generate initial code using AI. Skipping this app.',
        details,
      );

      await cleanup();
      break;
    }

    const userJourneys = config.enableUserJourneyTesting
      ? await generateUserJourneysForApp(
          ratingLlm,
          rootPromptDef.name,
          defsToExecute[0].prompt,
          initialResponse.files,
          abortSignal,
        )
      : undefined;

    // TODO: Only execute the serve command on the "final working attempt".
    // TODO: Incorporate usage.
    const userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined = userJourneys
      ? {
          userJourneys: userJourneys.result,
          appPrompt: defsToExecute[0].prompt,
        }
      : undefined;

    const attemptDetails: AttemptDetails[] = []; // Store details for assessment.json

    // Try to build the files in the root prompt directory.
    // This will also attempt to fix issues with the generated code.
    const attempt = await attemptBuildAndTest(
      config,
      evalID,
      env,
      rootPromptDef,
      directory,
      contextFiles,
      initialResponse,
      attemptDetails,
      abortSignal,
      workerConcurrencyQueue,
      progress,
      userJourneyAgentTaskInput,
    );

    if (!attempt) {
      await cleanup();
      break;
    }

    const score = await rateGeneratedCode(
      ratingLlm,
      env,
      promptDef,
      fullPromptText,
      attempt.outputFiles,
      attempt.buildResult,
      attempt.serveTestingResult,
      attempt.repairAttempts,
      attempt.axeRepairAttempts,
      abortSignal,
      progress,
      config.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME,
      attempt.testResult ?? null,
      attempt.testRepairAttempts,
    );

    results.push({
      promptDef: {
        // Note: we don't pass the prompt def along directly,
        // because it can contain data that cannot be encoded.
        name: promptDef.name,
        prompt: promptDef.prompt,
      },
      outputFiles: attempt.outputFiles,
      finalAttempt: attempt,
      score,
      repairAttempts: attempt.repairAttempts,
      attemptDetails,
      userJourneys: userJourneys,
      axeRepairAttempts: attempt.axeRepairAttempts,
      toolLogs,
      testResult: attempt.testResult ?? null,
      testRepairAttempts: attempt.testRepairAttempts,
    } satisfies AssessmentResult);
  }

  await cleanup();
  return results;
}
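For orientation, here is a minimal sketch of how a caller might drive the new entry point. It is illustrative only: the module path `start-evaluation-task.js`, the wrapper name, and the concurrency value are assumptions (the commit view does not show the new file names or the calling harness); only the `startEvaluationTask` signature above is taken from the diff.

import PQueue from 'p-queue';
import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
import {Environment} from '../configuration/environment.js';
import {ProgressLogger} from '../progress/progress-logger.js';
import {AssessmentConfig, AssessmentResult, PromptDefinition} from '../shared-interfaces.js';
import {EvalID} from './executors/executor.js';
import {startEvaluationTask} from './start-evaluation-task.js'; // assumed file name

// Hypothetical wrapper: runs one evaluation with bounded worker concurrency
// and aborts in-flight work if the task itself throws.
export async function runOneEvaluation(
  config: AssessmentConfig,
  evalID: EvalID,
  env: Environment,
  ratingLlm: GenkitRunner,
  promptDef: PromptDefinition,
  progress: ProgressLogger,
): Promise<AssessmentResult[]> {
  const workerConcurrencyQueue = new PQueue({concurrency: 2}); // assumed limit
  const abortController = new AbortController();

  try {
    return await startEvaluationTask(
      config,
      evalID,
      env,
      ratingLlm,
      promptDef,
      abortController.signal,
      workerConcurrencyQueue,
      progress,
    );
  } catch (e) {
    abortController.abort();
    throw e;
  }
}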
Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
import {join} from 'node:path';
import {LocalLlmGenerateFilesResponse} from '../codegen/llm-runner.js';
import {Environment} from '../configuration/environment.js';
import {ProgressLogger} from '../progress/progress-logger.js';
import {
  AssessmentConfig,
  LlmContextFile,
  LlmGenerateFilesRequest,
  RootPromptDefinition,
  Usage,
} from '../shared-interfaces.js';
import {EvalID} from './executors/executor.js';
import {LLM_OUTPUT_DIR} from '../configuration/constants.js';
import {globSync} from 'tinyglobby';
import {UserFacingError} from '../utils/errors.js';
import {readFile} from 'node:fs/promises';
import {createLlmResponseTokenUsageMessage} from './codegen.js';

/**
 * Generates the initial files for a prompt using an LLM.
 * @param options Assessment configuration, including the model name and whether the script is running in local mode.
 * @param evalID ID of the eval for which files are generated.
 * @param env Environment that is currently being run.
 * @param promptDef Definition of the prompt being generated.
 * @param codegenRequest Request carrying the full prompt, system instructions, and target directory.
 * @param contextFiles Files that should be passed as context to the LLM.
 * @param abortSignal Signal to fire when this process should be aborted.
 * @param progress Logger used to report progress of the generation.
 */
export async function generateInitialFiles(
  options: AssessmentConfig,
  evalID: EvalID,
  env: Environment,
  promptDef: RootPromptDefinition,
  codegenRequest: LlmGenerateFilesRequest,
  contextFiles: LlmContextFile[],
  abortSignal: AbortSignal,
  progress: ProgressLogger,
): Promise<LocalLlmGenerateFilesResponse> {
  if (options.localMode) {
    const localFilesDirectory = join(LLM_OUTPUT_DIR, env.id, promptDef.name);
    const filePaths = globSync('**/*', {cwd: localFilesDirectory});

    if (filePaths.length === 0) {
      throw new UserFacingError(`Could not find pre-existing files in ${localFilesDirectory}`);
    }

    return {
      files: await Promise.all(
        filePaths.map(async filePath => ({
          filePath,
          code: await readFile(join(localFilesDirectory, filePath), 'utf8'),
        })),
      ),
      usage: {
        inputTokens: 0,
        outputTokens: 0,
      } satisfies Usage,
      // TODO: We could also try save/restore reasoning locally.
      reasoning: '',
      toolLogs: [],
    };
  }

  progress.log(promptDef, 'codegen', 'Generating code with AI');

  const response = await env.executor.generateInitialFiles(
    evalID,
    codegenRequest,
    options.model,
    contextFiles,
    abortSignal,
  );

  if (response.success) {
    progress.log(
      promptDef,
      'codegen',
      'Received AI code generation response',
      createLlmResponseTokenUsageMessage(response) ?? '',
    );
  } else {
    progress.log(promptDef, 'error', 'Failed to generate code with AI', response.errors.join(', '));
  }

  if (!response.success) {
    throw new Error(`Initial file generation failed: ${response.errors.join('\n')}`);
  }

  return {
    files: response.outputFiles!,
    usage: response.usage,
    reasoning: response.reasoning,
    toolLogs: response.toolLogs,
  };
}
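As a usage note, in local mode this function reads previously generated output from `LLM_OUTPUT_DIR/<env.id>/<promptDef.name>` instead of calling the executor. The sketch below only illustrates consuming the returned `LocalLlmGenerateFilesResponse`, whose `files` entries carry `filePath` and `code` as shown above; the helper name and its relative import path are assumptions, not part of this commit.

import {dirname, join} from 'node:path';
import {mkdir, writeFile} from 'node:fs/promises';
import {LocalLlmGenerateFilesResponse} from '../codegen/llm-runner.js'; // path assumed relative to this sketch

// Hypothetical helper: writes every generated file beneath `targetDir`,
// preserving the relative paths reported by the LLM (or by local mode).
export async function dumpGeneratedFiles(
  response: LocalLlmGenerateFilesResponse,
  targetDir: string,
): Promise<void> {
  for (const file of response.files) {
    const destination = join(targetDir, file.filePath);
    await mkdir(dirname(destination), {recursive: true});
    await writeFile(destination, file.code, 'utf8');
  }
}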
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
import {Environment} from '../configuration/environment.js';
import {redX} from '../reporting/format.js';
import {summarizeReportWithAI} from '../reporting/report-ai-summary.js';
import {AssessmentResult, CompletionStats, RunSummary} from '../shared-interfaces.js';

/**
 * Prepares a summary of build statuses and score distributions from a list of assessment results
 * and also some extra metadata about the run.
 */
export async function prepareSummary(
  genkit: GenkitRunner,
  abortSignal: AbortSignal,
  model: string,
  env: Environment,
  assessments: AssessmentResult[],
  completionStats: CompletionStats,
  opts: {skipAiSummary?: boolean},
): Promise<RunSummary> {
  let inputTokens = 0;
  let outputTokens = 0;
  let totalTokens = 0;

  assessments.forEach(result => {
    // Incorporate usage from running raters.
    if (result.score.tokenUsage) {
      inputTokens += result.score.tokenUsage.inputTokens;
      outputTokens += result.score.tokenUsage.outputTokens;
      totalTokens += result.score.tokenUsage.totalTokens ?? 0;
    }

    // Incorporate usage numbers from all generate + build attempts.
    result.attemptDetails.forEach(attempt => {
      if (attempt.usage) {
        inputTokens += attempt.usage.inputTokens ?? 0;
        outputTokens += attempt.usage.outputTokens ?? 0;
        totalTokens += attempt.usage.totalTokens ?? 0;
      }
    });
  });

  let aiSummary: string | undefined = undefined;
  if (!opts.skipAiSummary) {
    try {
      const result = await summarizeReportWithAI(genkit, abortSignal, assessments);
      inputTokens += result.usage.inputTokens;
      outputTokens += result.usage.outputTokens;
      totalTokens += result.usage.totalTokens;
      aiSummary = result.responseHtml;
    } catch (e) {
      console.error(`${redX()} Failed to generate AI summary for report: ${e}`);
      if ((e as Partial<Error>).stack) {
        console.error((e as Error).stack);
      }
    }
  }

  const executorInfo = await env.executor.getExecutorInfo?.();

  return {
    model,
    environmentId: env.id,
    displayName: env.displayName,
    framework: {
      fullStackFramework: {
        id: env.fullStackFramework.id,
        displayName: env.fullStackFramework.displayName,
      },
      clientSideFramework: {
        id: env.clientSideFramework.id,
        displayName: env.clientSideFramework.displayName,
      },
    },
    aiSummary,
    completionStats: completionStats,
    usage: {
      inputTokens,
      outputTokens,
      totalTokens,
    },
    runner: {
      id: executorInfo.id,
      displayName: executorInfo.displayName,
    },
  } satisfies RunSummary;
}
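Finally, a hedged sketch of calling `prepareSummary` without the AI overview. The module path `./summary.js` and the wrapper name are assumptions; the argument order follows the signature above, and passing `skipAiSummary: true` skips the extra `summarizeReportWithAI` call.

import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
import {Environment} from '../configuration/environment.js';
import {AssessmentResult, CompletionStats, RunSummary} from '../shared-interfaces.js';
import {prepareSummary} from './summary.js'; // assumed file name

// Hypothetical wrapper: aggregates token usage and run metadata, but skips
// the AI-written report summary.
export async function summarizeRunWithoutAi(
  genkit: GenkitRunner,
  model: string,
  env: Environment,
  assessments: AssessmentResult[],
  completionStats: CompletionStats,
): Promise<RunSummary> {
  return prepareSummary(
    genkit,
    new AbortController().signal, // nothing to cancel when the AI summary is skipped
    model,
    env,
    assessments,
    completionStats,
    {skipAiSummary: true},
  );
}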
