Make multiround benchmarking logs clearer

GlebSolovev · GlebSolovev · commit c62d09bf6ca8 · 2025-01-17T04:35:46.000+01:00
diff --git a/src/benchmark/framework/benchmarkingCore/executeBenchmarkingTask.ts b/src/benchmark/framework/benchmarkingCore/executeBenchmarkingTask.ts
@@ -122,11 +122,18 @@ export async function executeBenchmarkingTask(
                 nextGeneratedProofId: nextGeneratedProofId,
                 roundNumber: roundNumber,
             };
+            const parentProofDesc =
+                parentProof === undefined
+                    ? "generate proofs"
+                    : `proof to fix id: ${parentProof?.benchmarkedProof.generatedProofId}`;
+            const thisRoundLogger = itemLogger.createChildLoggerWithIdentifier(
+                `[round: ${roundNumber}, ${parentProofDesc}]`
+            );
             const result = await benchmarkSingleCompletionGeneration(
                 thisRoundGenerationArgs,
                 options,
                 modelsScheduler,
-                itemLogger,
+                thisRoundLogger,
                 proofsChecker,
                 abortSignal
             );
diff --git a/src/benchmark/framework/benchmarkingCore/executeBenchmarkingTaskUtils/errorHandling.ts b/src/benchmark/framework/benchmarkingCore/executeBenchmarkingTaskUtils/errorHandling.ts
@@ -3,10 +3,7 @@ import { ModelParams } from "../../../../llm/llmServices/modelParams";
 
 import { buildErrorCompleteLog } from "../../../../utils/errorsUtils";
 import { IllegalStateError } from "../../../../utils/throwErrors";
-import {
-    AsOneRecordLogsBuilder,
-    BenchmarkingLogger,
-} from "../../logging/benchmarkingLogger";
+import { BenchmarkingLogger } from "../../logging/benchmarkingLogger";
 import { BenchmarkingModelParams } from "../../structures/benchmarkingCore/benchmarkingModelParams";
 import { BenchmarkingOptions } from "../../structures/benchmarkingCore/benchmarkingOptions";
 import { FailFastAbortError } from "../../utils/asyncUtils/abortUtils";
@@ -82,13 +79,14 @@ export namespace ExecuteBenchmarkingTaskErrorHandlingUtils {
     }
 
     export function logCommonError(
-        e: any,
+        e: ExpectedError,
         itemLogger: BenchmarkingLogger,
         params: BenchmarkingModelParams<ModelParams>,
         options: BenchmarkingOptions,
         abortSignal: AbortSignal
     ) {
-        const logConclusion = (errorRecordLogger: AsOneRecordLogsBuilder) => {
+        const errorRecordLogger = itemLogger.asOneRecord();
+        const logConclusion = () => {
             if (options.failFast) {
                 if (abortSignal.aborted) {
                     errorRecordLogger.info(
@@ -107,22 +105,16 @@ export namespace ExecuteBenchmarkingTaskErrorHandlingUtils {
         };
 
         if (e instanceof BenchmarkingError) {
-            logConclusion(itemLogger.asOneRecord().error(e.message));
+            errorRecordLogger.error(e.message);
         } else if (e instanceof ConfigurationError) {
-            logConclusion(
-                itemLogger
-                    .asOneRecord()
-                    .error(
-                        `"${params.modelParams.modelId}" is configured incorrectly: ${e.message}`
-                    )
+            errorRecordLogger.error(
+                `"${params.modelParams.modelId}" is configured incorrectly: ${e.message}`
             );
         } else {
-            logConclusion(
-                itemLogger
-                    .asOneRecord()
-                    .error(`Error occurred:`)
-                    .error(buildErrorCompleteLog(e), "gray")
-            );
+            errorRecordLogger
+                .error(`Error occurred:`)
+                .error(buildErrorCompleteLog(e), "gray");
         }
+        logConclusion();
     }
 }
diff --git a/src/benchmark/framework/benchmarkingCore/executeBenchmarkingTaskUtils/logging.ts b/src/benchmark/framework/benchmarkingCore/executeBenchmarkingTaskUtils/logging.ts
@@ -28,14 +28,14 @@ export namespace ExecuteBenchmarkingTaskLoggingUtils {
                     .debug("First valid proof:")
                     .debug(roundResult.thisRoundValidProofs[0].asString);
             } else {
-                asOneRecordLogs.debug(
+                asOneRecordLogs.info(
                     `However, no valid proofs have been found ${heavyCrossMark}`
                 );
             }
             const generatedProofsIds = roundResult.generatedProofs
                 .map((proof) => `${proof.generatedProofId}`)
                 .join(", ");
-            asOneRecordLogs.debug(
+            asOneRecordLogs.info(
                 `Newly generated proofs id-s are: [${generatedProofsIds}]`
             );
             logElapsedTime();
diff --git a/src/benchmark/framework/benchmarkingCore/singleCompletionGeneration/benchmarkSingleCompletionGeneration.ts b/src/benchmark/framework/benchmarkingCore/singleCompletionGeneration/benchmarkSingleCompletionGeneration.ts
@@ -31,6 +31,7 @@ import {
 } from "../../../../utils/time";
 import { BenchmarkingLogger } from "../../logging/benchmarkingLogger";
 import { writeTeamCityStatisticsValue } from "../../logging/consoleWriteUtils";
+import { infinitySymbol } from "../../logging/specialSymbols";
 import { BenchmarkingModelParams } from "../../structures/benchmarkingCore/benchmarkingModelParams";
 import { BenchmarkingOptions } from "../../structures/benchmarkingCore/benchmarkingOptions";
 import {
@@ -129,15 +130,6 @@ export async function benchmarkSingleCompletionGeneration<
         logger,
         abortSignal
     );
-    logger
-        .asOneRecord()
-        .info(
-            `Successfully generated ${proofGenerationResult.generatedProofs.length} proof(s)`
-        )
-        .debug(
-            `Effective elapsed time: ${proofGenerationResult.effectiveElapsedTimeMillis} ms`,
-            "gray"
-        );
     const preparedProofs: [string, GeneratedProof, number][] =
         proofGenerationResult.generatedProofs.map(
             (generatedProof: GeneratedProof, index: number) => [
@@ -266,6 +258,12 @@ export async function benchmarkSingleCompletionGeneration<
     return result;
 }
 
+/**
+ * Guards against buggy delay estimates:
+ * an infinite retry loop with zero delays might cause trouble.
+ */
+export const minDelayMillis = 100;
+
 namespace RemoteConnectionErrorDelays {
     export const initialDelayMillis = 10_000;
     export const exponentialMultiplier = 2;
@@ -341,6 +339,7 @@ async function generateProofWithRetriesExclusively<
             generateProof,
             generationArgs.llmService,
             options,
+            generationArgs.roundNumber,
             logger,
             abortSignal
         );
@@ -353,19 +352,29 @@ async function generateProofWithRetriesMeasured(
     ) => Promise<GeneratedProof[]>,
     llmService: LLMService<any, any>,
     options: BenchmarkingOptions,
+    roundNumber: number,
     logger: BenchmarkingLogger,
     abortSignal: AbortSignal
 ): Promise<ProofGenerationResult> {
     let delayMillis = 0;
     let prevFailureIsConnectionError = false;
-    let attemptIndex = 0;
+    let attemptIndex = 1;
+    const maxAttemptsString = options.proofGenerationRetries ?? infinitySymbol;
 
     let totalTime = new TimeMark();
     while (true) {
+        const attemptLogger = logger.createChildLoggerWithIdentifier(
+            `[proof generation attempt ${attemptIndex}/${maxAttemptsString}]`
+        );
         // `options.proofGenerationRetries` may be undefined, which means unlimited retries
-        if (attemptIndex === options.proofGenerationRetries) {
+        if (attemptIndex - 1 === options.proofGenerationRetries) {
+            attemptLogger.error(
+                `max retries (${options.proofGenerationRetries}) has been reached`,
+                "default"
+            );
             throwBenchmarkingError(
-                `Proof generation failed: max retries (${options.proofGenerationRetries}) has been reached`
+                `Proof generation failed: max retries (${options.proofGenerationRetries}) `,
+                `has been reached at round ${roundNumber}`
             );
         }
         throwOnAbort(abortSignal);
@@ -386,10 +395,10 @@ async function generateProofWithRetriesMeasured(
             };
 
             const tokens = result.tokensSpentInTotal;
-            logger
+            attemptLogger
                 .asOneRecord()
-                .debug(
-                    `Attempt #${attemptIndex}, successfully generated proofs`
+                .info(
+                    `Successfully generated ${generatedProofs.length} proof(s)`
                 )
                 .debug(
                     `Tokens spent: ${tokens.tokensSpentInTotal} = ${tokens.promptTokens} (prompt) + ${tokens.generatedTokens} (generated answer)`
@@ -415,19 +424,23 @@ async function generateProofWithRetriesMeasured(
             const llmServiceError = e as LLMServiceError;
 
             if (llmServiceError instanceof ConfigurationError) {
-                logger.debug(
-                    `Attempt #${attemptIndex}, configuration error: ${llmServiceError.message}`
+                attemptLogger.error(
+                    `Configuration error: ${llmServiceError.message}`,
+                    "default"
                 );
                 throw llmServiceError;
             }
             if (llmServiceError instanceof GenerationFailedError) {
                 const estimatedTime =
                     llmService.estimateTimeToBecomeAvailable();
-                delayMillis = timeToMillis(estimatedTime);
-                logger
+                delayMillis = Math.max(
+                    timeToMillis(estimatedTime),
+                    minDelayMillis
+                );
+                attemptLogger
                     .asOneRecord()
                     .debug(
-                        `Attempt #${attemptIndex}, generation failed error: ${llmServiceError.message}`
+                        `Generation failed error: ${llmServiceError.message}`
                     )
                     .debug(
                         `Estimated time to become available: ${timeToString(estimatedTime)}`
@@ -441,10 +454,10 @@ async function generateProofWithRetriesMeasured(
                         RemoteConnectionErrorDelays.initialDelayMillis;
                     prevFailureIsConnectionError = true;
                 }
-                logger
+                attemptLogger
                     .asOneRecord()
                     .debug(
-                        `Attempt #${attemptIndex}, remote connection error: ${stringifyAnyValue(llmServiceError.message)}`
+                        `Remote connection error: ${stringifyAnyValue(llmServiceError.message)}`
                     )
                     .debug(`Delay to wait for: ${millisToString(delayMillis)}`);
             } else {
diff --git a/src/benchmark/framework/logging/specialSymbols.ts b/src/benchmark/framework/logging/specialSymbols.ts
@@ -1,2 +1,3 @@
 export const heavyCheckMark = "\u2714";
 export const heavyCrossMark = "\u2718";
+export const infinitySymbol = "\u221E";
diff --git a/src/benchmark/framework/structures/benchmarkingResults/benchmarkedItem.ts b/src/benchmark/framework/structures/benchmarkingResults/benchmarkedItem.ts
@@ -118,6 +118,7 @@ abstract class AbstractBenchmarkedCompletionGeneration<
         );
     }
 
+    // TODO: can be cached and updated only on linking
     isSuccessfulCompletion(): boolean {
         return this.getAllValidProofs().length > 0;
     }

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`export const heavyCheckMark = "\u2714";`
`2`	`2`	`export const heavyCrossMark = "\u2718";`
	`3`	`+export const infinitySymbol = "\u221E";`
Original file line number	Diff line number	Diff line change
`@@ -118,6 +118,7 @@ abstract class AbstractBenchmarkedCompletionGeneration<`
`118`	`118`	`);`
`119`	`119`	`}`
`120`	`120`
	`121`	`+ // TODO: can be cached and updated only on linking`
`121`	`122`	`isSuccessfulCompletion(): boolean {`
`122`	`123`	`return this.getAllValidProofs().length > 0;`
`123`	`124`	`}`