From a2e0dd829c55887201ed2e38808b85a9cf2163f6 Mon Sep 17 00:00:00 2001 From: Gusted Date: Thu, 2 Jan 2025 02:52:08 +0100 Subject: [PATCH 1/4] chore: avoid trying to stream data `profile.Parse` always call `io.ReadAll` so avoid the trouble and a goroutine and do it ourselves. Add some limited testing (testing the parsed stack is volatile and not really feasible). --- modules/process/manager_stacktraces.go | 15 ++-- modules/process/manager_stacktraces_test.go | 93 +++++++++++++++++++++ 2 files changed, 100 insertions(+), 8 deletions(-) create mode 100644 modules/process/manager_stacktraces_test.go diff --git a/modules/process/manager_stacktraces.go b/modules/process/manager_stacktraces.go index e260893113..a603e6a9f2 100644 --- a/modules/process/manager_stacktraces.go +++ b/modules/process/manager_stacktraces.go @@ -4,8 +4,8 @@ package process import ( + "bytes" "fmt" - "io" "runtime/pprof" "sort" "time" @@ -175,13 +175,12 @@ func (pm *Manager) ProcessStacktraces(flat, noSystem bool) ([]*Process, int, int // Now from within the lock we need to get the goroutines. // Why? If we release the lock then between between filling the above map and getting // the stacktraces another process could be created which would then look like a dead process below - reader, writer := io.Pipe() - defer reader.Close() - go func() { - err := pprof.Lookup("goroutine").WriteTo(writer, 0) - _ = writer.CloseWithError(err) - }() - stacks, err = profile.Parse(reader) + var buf bytes.Buffer + if err := pprof.Lookup("goroutine").WriteTo(&buf, 0); err != nil { + return nil, 0, 0, err + } + + stacks, err = profile.ParseData(buf.Bytes()) if err != nil { return nil, 0, 0, err } diff --git a/modules/process/manager_stacktraces_test.go b/modules/process/manager_stacktraces_test.go new file mode 100644 index 0000000000..d00fbc5ec8 --- /dev/null +++ b/modules/process/manager_stacktraces_test.go @@ -0,0 +1,93 @@ +// Copyright 2025 The Forgejo Authors. All rights reserved. +// SPDX-License-Identifier: GPL-3.0-or-later + +package process + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestProcessStacktraces(t *testing.T) { + _, _, finish := GetManager().AddContext(context.Background(), "Normal process") + defer finish() + parentCtx, _, finish := GetManager().AddContext(context.Background(), "Children normal process") + defer finish() + _, _, finish = GetManager().AddContext(parentCtx, "Children process") + defer finish() + _, _, finish = GetManager().AddTypedContext(context.Background(), "System process", SystemProcessType, true) + defer finish() + + t.Run("No flat with no system process", func(t *testing.T) { + processes, processCount, _, err := GetManager().ProcessStacktraces(false, true) + require.NoError(t, err) + assert.EqualValues(t, 4, processCount) + assert.Len(t, processes, 2) + + assert.EqualValues(t, "Children normal process", processes[0].Description) + assert.EqualValues(t, NormalProcessType, processes[0].Type) + assert.Empty(t, processes[0].ParentPID) + assert.Len(t, processes[0].Children, 1) + + assert.EqualValues(t, "Children process", processes[0].Children[0].Description) + assert.EqualValues(t, processes[0].PID, processes[0].Children[0].ParentPID) + + assert.EqualValues(t, "Normal process", processes[1].Description) + assert.EqualValues(t, NormalProcessType, processes[1].Type) + assert.Empty(t, processes[1].ParentPID) + assert.Empty(t, processes[1].Children) + }) + + t.Run("Flat with no system process", func(t *testing.T) { + processes, processCount, _, err := GetManager().ProcessStacktraces(true, true) + require.NoError(t, err) + assert.EqualValues(t, 4, processCount) + assert.Len(t, processes, 3) + + assert.EqualValues(t, "Children process", processes[0].Description) + assert.EqualValues(t, NormalProcessType, processes[0].Type) + assert.EqualValues(t, processes[1].PID, processes[0].ParentPID) + assert.Empty(t, processes[0].Children) + + assert.EqualValues(t, "Children normal process", processes[1].Description) + assert.EqualValues(t, NormalProcessType, processes[1].Type) + assert.Empty(t, processes[1].ParentPID) + assert.Empty(t, processes[1].Children) + + assert.EqualValues(t, "Normal process", processes[2].Description) + assert.EqualValues(t, NormalProcessType, processes[2].Type) + assert.Empty(t, processes[2].ParentPID) + assert.Empty(t, processes[2].Children) + }) + + t.Run("System process", func(t *testing.T) { + processes, processCount, _, err := GetManager().ProcessStacktraces(false, false) + require.NoError(t, err) + assert.EqualValues(t, 4, processCount) + assert.Len(t, processes, 4) + + assert.EqualValues(t, "System process", processes[0].Description) + assert.EqualValues(t, SystemProcessType, processes[0].Type) + assert.Empty(t, processes[0].ParentPID) + assert.Empty(t, processes[0].Children) + + assert.EqualValues(t, "Children normal process", processes[1].Description) + assert.EqualValues(t, NormalProcessType, processes[1].Type) + assert.Empty(t, processes[1].ParentPID) + assert.Len(t, processes[1].Children, 1) + + assert.EqualValues(t, "Normal process", processes[2].Description) + assert.EqualValues(t, NormalProcessType, processes[2].Type) + assert.Empty(t, processes[2].ParentPID) + assert.Empty(t, processes[2].Children) + + // This is the "main" pid, testing code always runs in a goroutine. + assert.EqualValues(t, "(unassociated)", processes[3].Description) + assert.EqualValues(t, NoneProcessType, processes[3].Type) + assert.Empty(t, processes[3].ParentPID) + assert.Empty(t, processes[3].Children) + }) +} From 3f44b97b5f39ad3e8dd923528e8acd28a6f073b3 Mon Sep 17 00:00:00 2001 From: Gusted Date: Thu, 2 Jan 2025 03:43:58 +0100 Subject: [PATCH 2/4] feat: add limited execution tracing support - For every process that is spawned (every new non-trivial goroutine such as http requests, queues or tasks) start a [execution tracer](https://pkg.go.dev/runtime/trace). This allows very precise diagnosis of how each individual process over a time period. - It's safe and [fast](https://go.dev/blog/execution-traces-2024#low-overhead-tracing) to be run in production, hence no setting to disable this. There's only noticable overhead when tracing is actually performed and not continuous. - Proper tracing support would mean the codebase would be full of `trace.WithRegion` and `trace.Log`, which feels premature for this patch as there's no real-world usage yet to indicate which places would need this the most. So far only Git commands and SQL queries receive somewhat proper tracing support given that these are used throughout the codebase. - Make git commands a new process type. - Add tracing to diagnosis zip file. --- models/db/engine.go | 22 +++++++++++++++++ modules/git/command.go | 6 +++-- modules/process/manager.go | 37 +++++++++++++++++++++-------- modules/process/process.go | 3 +++ routers/common/middleware.go | 3 +++ routers/web/admin/diagnosis.go | 33 +++++++++++++++---------- templates/admin/stacktrace-row.tmpl | 2 ++ 7 files changed, 82 insertions(+), 24 deletions(-) diff --git a/models/db/engine.go b/models/db/engine.go index 61649592e7..822618a7e3 100755 --- a/models/db/engine.go +++ b/models/db/engine.go @@ -11,6 +11,7 @@ import ( "fmt" "io" "reflect" + "runtime/trace" "strings" "time" @@ -163,6 +164,8 @@ func InitEngine(ctx context.Context) error { Logger: errorLogger, }) + xormEngine.AddHook(&TracingHook{}) + SetDefaultEngine(ctx, xormEngine) return nil } @@ -318,6 +321,25 @@ func SetLogSQL(ctx context.Context, on bool) { } } +type TracingHook struct{} + +var _ contexts.Hook = &TracingHook{} + +type sqlTask struct{} + +func (TracingHook) BeforeProcess(c *contexts.ContextHook) (context.Context, error) { + ctx, task := trace.NewTask(c.Ctx, "sql") + ctx = context.WithValue(ctx, sqlTask{}, task) + trace.Log(ctx, "query", c.SQL) + trace.Logf(ctx, "args", "%v", c.Args) + return ctx, nil +} + +func (TracingHook) AfterProcess(c *contexts.ContextHook) error { + c.Ctx.Value(sqlTask{}).(*trace.Task).End() + return nil +} + type SlowQueryHook struct { Treshold time.Duration Logger log.Logger diff --git a/modules/git/command.go b/modules/git/command.go index a3d43aaec6..605816b7a2 100644 --- a/modules/git/command.go +++ b/modules/git/command.go @@ -13,6 +13,7 @@ import ( "os" "os/exec" "runtime" + "runtime/trace" "strings" "time" @@ -317,12 +318,13 @@ func (c *Command) Run(opts *RunOpts) error { var finished context.CancelFunc if opts.UseContextTimeout { - ctx, cancel, finished = process.GetManager().AddContext(c.parentContext, desc) + ctx, cancel, finished = process.GetManager().AddTypedContext(c.parentContext, desc, process.GitProcessType, true) } else { - ctx, cancel, finished = process.GetManager().AddContextTimeout(c.parentContext, timeout, desc) + ctx, cancel, finished = process.GetManager().AddTypedContextTimeout(c.parentContext, timeout, desc, process.GitProcessType, true) } defer finished() + trace.Log(ctx, "command", desc) startTime := time.Now() cmd := exec.CommandContext(ctx, c.prog, c.args...) diff --git a/modules/process/manager.go b/modules/process/manager.go index 37098ad92f..062ee1482f 100644 --- a/modules/process/manager.go +++ b/modules/process/manager.go @@ -7,6 +7,7 @@ package process import ( "context" "runtime/pprof" + "runtime/trace" "strconv" "sync" "sync/atomic" @@ -126,6 +127,27 @@ func (pm *Manager) AddTypedContext(parent context.Context, description, processT return ctx, cancel, finished } +// AddTypedContextTimeout creates a new context and adds it as a process. Once the process is finished, finished must be called +// to remove the process from the process table. It should not be called until the process is finished but must always be called. +// +// cancel should be used to cancel the returned context, however it will not remove the process from the process table. +// finished will cancel the returned context and remove it from the process table. +// +// Most processes will not need to use the cancel function but there will be cases whereby you want to cancel the process but not immediately remove it from the +// process table. +func (pm *Manager) AddTypedContextTimeout(parent context.Context, timeout time.Duration, description, processType string, currentlyRunning bool) (ctx context.Context, cancel context.CancelFunc, finished FinishedFunc) { + if timeout <= 0 { + // it's meaningless to use timeout <= 0, and it must be a bug! so we must panic here to tell developers to make the timeout correct + panic("the timeout must be greater than zero, otherwise the context will be cancelled immediately") + } + + ctx, cancel = context.WithTimeout(parent, timeout) + + ctx, _, finished = pm.Add(ctx, description, cancel, processType, currentlyRunning) + + return ctx, cancel, finished +} + // AddContextTimeout creates a new context and add it as a process. Once the process is finished, finished must be called // to remove the process from the process table. It should not be called until the process is finished but must always be called. // @@ -135,16 +157,7 @@ func (pm *Manager) AddTypedContext(parent context.Context, description, processT // Most processes will not need to use the cancel function but there will be cases whereby you want to cancel the process but not immediately remove it from the // process table. func (pm *Manager) AddContextTimeout(parent context.Context, timeout time.Duration, description string) (ctx context.Context, cancel context.CancelFunc, finished FinishedFunc) { - if timeout <= 0 { - // it's meaningless to use timeout <= 0, and it must be a bug! so we must panic here to tell developers to make the timeout correct - panic("the timeout must be greater than zero, otherwise the context will be cancelled immediately") - } - - ctx, cancel = context.WithTimeout(parent, timeout) - - ctx, _, finished = pm.Add(ctx, description, cancel, NormalProcessType, true) - - return ctx, cancel, finished + return pm.AddTypedContextTimeout(parent, timeout, description, NormalProcessType, true) } // Add create a new process @@ -159,6 +172,8 @@ func (pm *Manager) Add(ctx context.Context, description string, cancel context.C parentPID = "" } + ctx, traceTask := trace.NewTask(ctx, processType) + process := &process{ PID: pid, ParentPID: parentPID, @@ -166,6 +181,7 @@ func (pm *Manager) Add(ctx context.Context, description string, cancel context.C Start: start, Cancel: cancel, Type: processType, + TraceTrask: traceTask, } var finished FinishedFunc @@ -218,6 +234,7 @@ func (pm *Manager) nextPID() (start time.Time, pid IDType) { } func (pm *Manager) remove(process *process) { + process.TraceTrask.End() deleted := false pm.mutex.Lock() diff --git a/modules/process/process.go b/modules/process/process.go index 06a28c4a60..8947eb252f 100644 --- a/modules/process/process.go +++ b/modules/process/process.go @@ -5,12 +5,14 @@ package process import ( "context" + "runtime/trace" "time" ) var ( SystemProcessType = "system" RequestProcessType = "request" + GitProcessType = "git" NormalProcessType = "normal" NoneProcessType = "none" ) @@ -23,6 +25,7 @@ type process struct { Start time.Time Cancel context.CancelFunc Type string + TraceTrask *trace.Task } // ToProcess converts a process to a externally usable Process diff --git a/routers/common/middleware.go b/routers/common/middleware.go index 59e59b8d3f..ebc4d62d03 100644 --- a/routers/common/middleware.go +++ b/routers/common/middleware.go @@ -6,6 +6,7 @@ package common import ( "fmt" "net/http" + "runtime/trace" "strings" "code.gitea.io/gitea/modules/cache" @@ -43,6 +44,8 @@ func ProtocolMiddlewares() (handlers []any) { return http.HandlerFunc(func(resp http.ResponseWriter, req *http.Request) { ctx, _, finished := process.GetManager().AddTypedContext(req.Context(), fmt.Sprintf("%s: %s", req.Method, req.RequestURI), process.RequestProcessType, true) defer finished() + trace.Log(ctx, "method", req.Method) + trace.Log(ctx, "url", req.RequestURI) next.ServeHTTP(context.WrapResponseWriter(resp), req.WithContext(cache.WithCacheContext(ctx))) }) }) diff --git a/routers/web/admin/diagnosis.go b/routers/web/admin/diagnosis.go index 020554a35a..1c12259b06 100644 --- a/routers/web/admin/diagnosis.go +++ b/routers/web/admin/diagnosis.go @@ -7,6 +7,7 @@ import ( "archive/zip" "fmt" "runtime/pprof" + "runtime/trace" "time" "code.gitea.io/gitea/modules/httplib" @@ -15,17 +16,12 @@ import ( func MonitorDiagnosis(ctx *context.Context) { seconds := ctx.FormInt64("seconds") - if seconds <= 5 { - seconds = 5 - } - if seconds > 300 { - seconds = 300 - } + seconds = max(5, min(300, seconds)) httplib.ServeSetHeaders(ctx.Resp, &httplib.ServeHeaderOptions{ ContentType: "application/zip", Disposition: "attachment", - Filename: fmt.Sprintf("gitea-diagnosis-%s.zip", time.Now().Format("20060102-150405")), + Filename: fmt.Sprintf("forgejo-diagnosis-%s.zip", time.Now().Format("20060102-150405")), }) zipWriter := zip.NewWriter(ctx.Resp) @@ -44,14 +40,27 @@ func MonitorDiagnosis(ctx *context.Context) { return } - err = pprof.StartCPUProfile(f) - if err == nil { - time.Sleep(time.Duration(seconds) * time.Second) - pprof.StopCPUProfile() - } else { + if err := pprof.StartCPUProfile(f); err != nil { _, _ = f.Write([]byte(err.Error())) } + f, err = zipWriter.CreateHeader(&zip.FileHeader{Name: "trace.dat", Method: zip.Deflate, Modified: time.Now()}) + if err != nil { + ctx.ServerError("Failed to create zip file", err) + return + } + + if err := trace.Start(f); err != nil { + _, _ = f.Write([]byte(err.Error())) + } + + select { + case <-time.After(time.Duration(seconds) * time.Second): + case <-ctx.Done(): + } + pprof.StopCPUProfile() + trace.Stop() + f, err = zipWriter.CreateHeader(&zip.FileHeader{Name: "goroutine-after.txt", Method: zip.Deflate, Modified: time.Now()}) if err != nil { ctx.ServerError("Failed to create zip file", err) diff --git a/templates/admin/stacktrace-row.tmpl b/templates/admin/stacktrace-row.tmpl index 97c361ff90..048056cf4e 100644 --- a/templates/admin/stacktrace-row.tmpl +++ b/templates/admin/stacktrace-row.tmpl @@ -7,6 +7,8 @@ {{svg "octicon-cpu" 16}} {{else if eq .Process.Type "normal"}} {{svg "octicon-terminal" 16}} + {{else if eq .Process.Type "git"}} + {{svg "octicon-git-branch" 16}} {{else}} {{svg "octicon-code" 16}} {{end}} From ebe6ebe3f3e3d1bc071ed52ede2279ea0d04c3ca Mon Sep 17 00:00:00 2001 From: Gusted Date: Fri, 3 Jan 2025 09:33:09 +0100 Subject: [PATCH 3/4] feat: Run garbage collection before profiling heap - This avoids returning that some memory is allocated for something that's actually no longer in used. Go's standard library also does this for testing and benchmarking when returning memory profiles. A canonical link that this recommended is the example "To add equivalent profiling support to a standalone program" in https://pkg.go.dev/runtime/pprof --- routers/web/admin/diagnosis.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/routers/web/admin/diagnosis.go b/routers/web/admin/diagnosis.go index 1c12259b06..959c9bc444 100644 --- a/routers/web/admin/diagnosis.go +++ b/routers/web/admin/diagnosis.go @@ -6,6 +6,7 @@ package admin import ( "archive/zip" "fmt" + "runtime" "runtime/pprof" "runtime/trace" "time" @@ -73,5 +74,8 @@ func MonitorDiagnosis(ctx *context.Context) { ctx.ServerError("Failed to create zip file", err) return } + // To avoid showing memory that actually can be cleaned, run the garbage + // collector. + runtime.GC() _ = pprof.Lookup("heap").WriteTo(f, 0) } From ba8ebcbd68f190ed72b325e88c4c5799d9d0c225 Mon Sep 17 00:00:00 2001 From: Gusted Date: Sat, 4 Jan 2025 05:09:00 +0100 Subject: [PATCH 4/4] feat: improve 'download diagnosis report' ui - Use `