Skip to content

Commit 0c1605e

Browse files
authored
feat: AsyncDownload增加日志,方便排查请求慢问题 (#379)
* AsyncDownload请求慢排查 * add nolint
1 parent 65b37a9 commit 0c1605e

File tree

2 files changed

+106
-11
lines changed

2 files changed

+106
-11
lines changed

cmd/feed-server/bll/asyncdownload/asyncdownload.go

Lines changed: 64 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -192,63 +192,103 @@ func (ad *Service) GetAsyncDownloadTaskStatus(kt *kit.Kit, bizID uint32, taskID
192192
return task.Status, nil
193193
}
194194

195+
// nolint
195196
func (ad *Service) upsertAsyncDownloadJob(kt *kit.Kit, bizID, appID uint32, filePath, fileName,
196197
targetAgentID, targetContainerID, targetUser, targetDir, signature string) (string, error) {
197-
fileKey := fmt.Sprintf("AsyncDownloadJob:%d:%d:%s:*", bizID, appID, path.Join(filePath, fileName))
198-
// lock by file to prevent concurrency writing in other requests
198+
startTime := time.Now()
199+
rid := kt.Rid
200+
fullPath := path.Join(filePath, fileName)
201+
fileKey := fmt.Sprintf("AsyncDownloadJob:%d:%d:%s:*", bizID, appID, fullPath)
202+
203+
// 获取外层锁(按文件路径)- 关键:记录等待锁的时间
204+
outerLockStart := time.Now()
199205
ad.redLock.Acquire(fileKey)
206+
outerLockWaitDuration := time.Since(outerLockStart)
207+
if outerLockWaitDuration > 100*time.Millisecond {
208+
logs.Warnf("[upsertAsyncDownloadJob] outer lock wait time long, rid: %s, biz_id: %d, file: %s, wait_duration: %v",
209+
rid, bizID, fullPath, outerLockWaitDuration)
210+
}
200211
defer ad.redLock.Release(fileKey)
212+
213+
// 查找现有 job
214+
keysStart := time.Now()
201215
keys, err := ad.cs.Redis().Keys(kt.Ctx, fileKey)
216+
keysDuration := time.Since(keysStart)
202217
if err != nil {
218+
logs.Errorf("[upsertAsyncDownloadJob] redis keys failed, rid: %s, biz_id: %d, file: %s, duration: %v, err: %v",
219+
rid, bizID, fullPath, keysDuration, err)
203220
return "", err
204221
}
222+
if keysDuration > 100*time.Millisecond {
223+
logs.Warnf("[upsertAsyncDownloadJob] redis keys slow, rid: %s, biz_id: %d, file: %s, found_keys: %d, duration: %v",
224+
rid, bizID, fullPath, len(keys), keysDuration)
225+
}
226+
205227
for _, key := range keys {
206228
if jobID, ok := func() (string, bool) {
207-
// lock by job to prevent concurrency writing in job manager
229+
// 获取内层锁(按 job ID)- 关键:记录等待锁的时间
230+
innerLockStart := time.Now()
208231
ad.redLock.Acquire(key)
232+
innerLockWaitDuration := time.Since(innerLockStart)
233+
if innerLockWaitDuration > 100*time.Millisecond {
234+
logs.Warnf("[upsertAsyncDownloadJob] inner lock wait time long, rid: %s, biz_id: %d, job_key: %s, wait_duration: %v",
235+
rid, bizID, key, innerLockWaitDuration)
236+
}
209237
defer ad.redLock.Release(key)
238+
210239
data, e := ad.cs.Redis().Get(kt.Ctx, key)
211240
if e != nil {
212-
logs.Errorf("get key %s from redis failed, err %s", key, e.Error())
241+
logs.Errorf("[upsertAsyncDownloadJob] redis get failed, rid: %s, biz_id: %d, job_key: %s, err: %v",
242+
rid, bizID, key, e)
213243
return "", false
214244
}
215245
if data == "" {
216246
return "", false
217247
}
218248
job := &types.AsyncDownloadJob{}
219249
if e := jsoni.UnmarshalFromString(data, &job); e != nil {
220-
logs.Errorf("unmarshal job %s failed, err %s", key, e.Error())
250+
logs.Errorf("[upsertAsyncDownloadJob] unmarshal job failed, rid: %s, biz_id: %d, job_key: %s, err: %v",
251+
rid, bizID, key, e)
221252
return "", false
222253
}
223254
if job.Status == types.AsyncDownloadJobStatusPending {
224255
// pending job exists, update it
256+
updateStart := time.Now()
225257
job.Targets = append(job.Targets, &types.AsyncDownloadTarget{
226258
AgentID: targetAgentID,
227259
ContainerID: targetContainerID,
228260
})
229261
js, e := jsoni.Marshal(job)
230262
if e != nil {
231-
logs.Errorf("marshal job %s failed, err %s", key, e.Error())
263+
logs.Errorf("[upsertAsyncDownloadJob] marshal job failed, rid: %s, biz_id: %d, job_key: %s, err: %v",
264+
rid, bizID, key, e)
232265
return "", false
233266
}
234267
if e := ad.cs.Redis().Set(kt.Ctx, key, string(js), 30*60); e != nil {
235-
logs.Errorf("set job %s to redis failed, err %s", key, e.Error())
268+
logs.Errorf("[upsertAsyncDownloadJob] redis set failed, rid: %s, biz_id: %d, job_key: %s, err: %v",
269+
rid, bizID, key, e)
236270
return "", false
237271
}
272+
updateDuration := time.Since(updateStart)
273+
totalDuration := time.Since(startTime)
274+
logs.Infof("[upsertAsyncDownloadJob] update existing job, rid: %s, biz_id: %d, job_id: %s, targets_count: %d, update_duration: %v, total_duration: %v",
275+
rid, bizID, key, len(job.Targets), updateDuration, totalDuration)
238276
return key, true
239277
}
240278
return "", false
241279

242280
}(); ok {
243-
logs.Infof("async download, job %s exists, update it", jobID)
281+
logs.Infof("[upsertAsyncDownloadJob] update existing job, rid: %s, biz_id: %d, job_id: %s",
282+
rid, bizID, jobID)
244283
return jobID, nil
245284
}
246285
}
247286

248287
// no pendeing job exists, create a new one
249288
// it's not possible to create two same job in same time, so use time stamp as unique id would be friendly.
289+
createStart := time.Now()
250290
jobID := fmt.Sprintf("AsyncDownloadJob:%d:%d:%s:%s", bizID, appID,
251-
path.Join(filePath, fileName), time.Now().Format("20060102150405"))
291+
fullPath, time.Now().Format("20060102150405"))
252292
job := &types.AsyncDownloadJob{
253293
JobID: jobID,
254294
BizID: bizID,
@@ -273,15 +313,28 @@ func (ad *Service) upsertAsyncDownloadJob(kt *kit.Kit, bizID, appID uint32, file
273313
}
274314
js, err := jsoni.Marshal(job)
275315
if err != nil {
316+
logs.Errorf("[upsertAsyncDownloadJob] marshal new job failed, rid: %s, biz_id: %d, job_id: %s, err: %v",
317+
rid, bizID, jobID, err)
276318
return "", err
277319
}
278320

279321
ad.metric.jobCounter.With(prm.Labels{"biz": strconv.Itoa(int(job.BizID)),
280322
"app": strconv.Itoa(int(job.AppID)), "file": path.Join(job.FilePath, job.FileName),
281323
"targets": strconv.Itoa(len(job.Targets)), "status": job.Status}).Inc()
282324

283-
logs.Infof("async download, create new job %s", jobID)
284-
return jobID, ad.cs.Redis().Set(kt.Ctx, jobID, string(js), 30*60)
325+
err = ad.cs.Redis().Set(kt.Ctx, jobID, string(js), 30*60)
326+
if err != nil {
327+
logs.Errorf("[upsertAsyncDownloadJob] redis set new job failed, rid: %s, biz_id: %d, job_id: %s, err: %v",
328+
rid, bizID, jobID, err)
329+
return "", err
330+
}
331+
332+
createDuration := time.Since(createStart)
333+
totalDuration := time.Since(startTime)
334+
logs.Infof("[upsertAsyncDownloadJob] create new job, rid: %s, biz_id: %d, job_id: %s, create_duration: %v, total_duration: %v (outer_lock_wait: %v, keys_duration: %v)",
335+
rid, bizID, jobID, createDuration, totalDuration, outerLockWaitDuration, keysDuration)
336+
337+
return jobID, nil
285338
}
286339

287340
func (ad *Service) upsertAsyncDownloadTask(ctx context.Context, taskID string,

cmd/feed-server/service/rpc_sidecar.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -720,60 +720,102 @@ func (s *Service) ListApps(ctx context.Context, req *pbfs.ListAppsReq) (*pbfs.Li
720720
}
721721

722722
// AsyncDownload 异步 p2p 下载,文件名为 sha256
723+
// nolint
723724
func (s *Service) AsyncDownload(ctx context.Context, req *pbfs.AsyncDownloadReq) (*pbfs.AsyncDownloadResp, error) {
725+
startTime := time.Now()
724726
kit := kit.FromGrpcContext(ctx)
727+
rid := kit.Rid
728+
729+
logs.Infof("[AsyncDownload] start, rid: %s, biz_id: %d, app_id: %d, file: %s/%s",
730+
rid, req.BizId, req.FileMeta.ConfigItemAttachment.AppId,
731+
req.FileMeta.ConfigItemSpec.Path, req.FileMeta.ConfigItemSpec.Name)
725732

726733
// 鉴权
734+
authStart := time.Now()
727735
credential := getCredential(ctx)
728736
app, err := s.bll.AppCache().GetMeta(kit, req.BizId, req.FileMeta.ConfigItemAttachment.AppId)
737+
authDuration := time.Since(authStart)
729738
if err != nil {
739+
logs.Errorf("[AsyncDownload] get app metadata failed, rid: %s, biz_id: %d, app_id: %d, duration: %v, err: %v",
740+
rid, req.BizId, req.FileMeta.ConfigItemAttachment.AppId, authDuration, err)
730741
return nil, status.Errorf(codes.Aborted, "get app %d metadata failed, %s",
731742
req.FileMeta.ConfigItemAttachment.AppId, err.Error())
732743
}
744+
logs.Infof("[AsyncDownload] get app metadata success, rid: %s, biz_id: %d, app_id: %d, app_name: %s, duration: %v",
745+
rid, req.BizId, req.FileMeta.ConfigItemAttachment.AppId, app.Name, authDuration)
746+
733747
if !credential.MatchApp(app.Name) {
748+
logs.Errorf("[AsyncDownload] app permission denied, rid: %s, biz_id: %d, app_name: %s",
749+
rid, req.BizId, app.Name)
734750
return nil, status.Errorf(codes.PermissionDenied, "not have app %s permission", app.Name)
735751
}
736752

737753
req.FileMeta.ConfigItemSpec.Path = tools.ConvertBackslashes(req.FileMeta.ConfigItemSpec.Path)
738754

739755
if !credential.MatchConfigItem(app.Name, req.FileMeta.ConfigItemSpec.Path, req.FileMeta.ConfigItemSpec.Name) {
756+
logs.Errorf("[AsyncDownload] config item permission denied, rid: %s, biz_id: %d, app_name: %s, path: %s, name: %s",
757+
rid, req.BizId, app.Name, req.FileMeta.ConfigItemSpec.Path, req.FileMeta.ConfigItemSpec.Name)
740758
return nil, status.Error(codes.PermissionDenied, "no permission download file")
741759
}
742760

743761
gseConf := cc.FeedServer().GSE
744762
if !gseConf.Enabled {
763+
logs.Errorf("[AsyncDownload] p2p download disabled, rid: %s, biz_id: %d",
764+
rid, req.BizId)
745765
return nil, status.Error(codes.FailedPrecondition, "p2p download is disabled in server")
746766
}
747767

748768
// 获取客户端信息,check 是否支持 p2p 下载
769+
agentInfoStart := time.Now()
749770
clientAgentID, clientContainerID, err := s.getAsyncDownloadAgentInfo(ctx, req)
771+
agentInfoDuration := time.Since(agentInfoStart)
750772
if err != nil {
773+
logs.Errorf("[AsyncDownload] get agent info failed, rid: %s, biz_id: %d, duration: %v, err: %v",
774+
rid, req.BizId, agentInfoDuration, err)
751775
return nil, err
752776
}
777+
logs.Infof("[AsyncDownload] get agent info success, rid: %s, biz_id: %d, agent_id: %s, container_id: %s, duration: %v",
778+
rid, req.BizId, clientAgentID, clientContainerID, agentInfoDuration)
753779

754780
// 验证agentID是否属于指定的业务
781+
verifyStart := time.Now()
755782
if err = s.verifyAgentBelongsToBiz(
756783
ctx,
757784
clientAgentID,
758785
req.BizId,
759786
req.FileMeta.ConfigItemAttachment.AppId,
760787
app.Name,
761788
); err != nil {
789+
verifyDuration := time.Since(verifyStart)
790+
logs.Errorf("[AsyncDownload] verify agent belongs to biz failed, rid: %s, biz_id: %d, agent_id: %s, duration: %v, err: %v",
791+
rid, req.BizId, clientAgentID, verifyDuration, err)
762792
return nil, err
763793
}
794+
verifyDuration := time.Since(verifyStart)
795+
logs.Infof("[AsyncDownload] verify agent belongs to biz success, rid: %s, biz_id: %d, agent_id: %s, duration: %v",
796+
rid, req.BizId, clientAgentID, verifyDuration)
764797

765798
// 创建下载任务
799+
createTaskStart := time.Now()
766800
taskID, err := s.bll.AsyncDownload().CreateAsyncDownloadTask(kit, req.BizId,
767801
req.FileMeta.ConfigItemAttachment.AppId, req.FileMeta.ConfigItemSpec.Path, req.FileMeta.ConfigItemSpec.Name,
768802
clientAgentID, clientContainerID, req.FileMeta.ConfigItemSpec.Permission.User, req.FileDir,
769803
req.FileMeta.CommitSpec.Content.Signature)
804+
createTaskDuration := time.Since(createTaskStart)
770805
if err != nil {
806+
logs.Errorf("[AsyncDownload] create download task failed, rid: %s, biz_id: %d, app_id: %d, duration: %v, err: %v",
807+
rid, req.BizId, req.FileMeta.ConfigItemAttachment.AppId, createTaskDuration, err)
771808
return nil, err
772809
}
810+
logs.Infof("[AsyncDownload] create download task success, rid: %s, biz_id: %d, app_id: %d, task_id: %s, duration: %v",
811+
rid, req.BizId, req.FileMeta.ConfigItemAttachment.AppId, taskID, createTaskDuration)
773812

813+
totalDuration := time.Since(startTime)
774814
r := &pbfs.AsyncDownloadResp{
775815
TaskId: taskID,
776816
}
817+
logs.Infof("[AsyncDownload] completed, rid: %s, biz_id: %d, app_id: %d, task_id: %s, total_duration: %v",
818+
rid, req.BizId, req.FileMeta.ConfigItemAttachment.AppId, taskID, totalDuration)
777819
return r, nil
778820
}
779821

0 commit comments

Comments
 (0)