feat: add limited execution tracing support

- For every process that is spawned (every new non-trivial goroutine
such as http requests, queues or tasks) start a [execution
tracer](https://pkg.go.dev/runtime/trace). This allows very precise
diagnosis of how each individual process over a time period.
- It's safe and [fast](https://go.dev/blog/execution-traces-2024#low-overhead-tracing) to
be run in production, hence no setting to disable this. There's only
noticable overhead when tracing is actually performed and not continuous.
- Proper tracing support would mean the codebase would be full of
`trace.WithRegion` and `trace.Log`, which feels premature for this patch
as there's no real-world usage yet to indicate which places would need
this the most. So far only Git commands and SQL queries receive somewhat
proper tracing support given that these are used throughout the codebase.
- Make git commands a new process type.
- Add tracing to diagnosis zip file.
This commit is contained in:
Gusted 2025-01-02 03:43:58 +01:00
parent a2e0dd829c
commit 3f44b97b5f
No known key found for this signature in database
GPG key ID: FD821B732837125F
7 changed files with 82 additions and 24 deletions

View file

@ -11,6 +11,7 @@ import (
"fmt"
"io"
"reflect"
"runtime/trace"
"strings"
"time"
@ -163,6 +164,8 @@ func InitEngine(ctx context.Context) error {
Logger: errorLogger,
})
xormEngine.AddHook(&TracingHook{})
SetDefaultEngine(ctx, xormEngine)
return nil
}
@ -318,6 +321,25 @@ func SetLogSQL(ctx context.Context, on bool) {
}
}
type TracingHook struct{}
var _ contexts.Hook = &TracingHook{}
type sqlTask struct{}
func (TracingHook) BeforeProcess(c *contexts.ContextHook) (context.Context, error) {
ctx, task := trace.NewTask(c.Ctx, "sql")
ctx = context.WithValue(ctx, sqlTask{}, task)
trace.Log(ctx, "query", c.SQL)
trace.Logf(ctx, "args", "%v", c.Args)
return ctx, nil
}
func (TracingHook) AfterProcess(c *contexts.ContextHook) error {
c.Ctx.Value(sqlTask{}).(*trace.Task).End()
return nil
}
type SlowQueryHook struct {
Treshold time.Duration
Logger log.Logger

View file

@ -13,6 +13,7 @@ import (
"os"
"os/exec"
"runtime"
"runtime/trace"
"strings"
"time"
@ -317,12 +318,13 @@ func (c *Command) Run(opts *RunOpts) error {
var finished context.CancelFunc
if opts.UseContextTimeout {
ctx, cancel, finished = process.GetManager().AddContext(c.parentContext, desc)
ctx, cancel, finished = process.GetManager().AddTypedContext(c.parentContext, desc, process.GitProcessType, true)
} else {
ctx, cancel, finished = process.GetManager().AddContextTimeout(c.parentContext, timeout, desc)
ctx, cancel, finished = process.GetManager().AddTypedContextTimeout(c.parentContext, timeout, desc, process.GitProcessType, true)
}
defer finished()
trace.Log(ctx, "command", desc)
startTime := time.Now()
cmd := exec.CommandContext(ctx, c.prog, c.args...)

View file

@ -7,6 +7,7 @@ package process
import (
"context"
"runtime/pprof"
"runtime/trace"
"strconv"
"sync"
"sync/atomic"
@ -126,6 +127,27 @@ func (pm *Manager) AddTypedContext(parent context.Context, description, processT
return ctx, cancel, finished
}
// AddTypedContextTimeout creates a new context and adds it as a process. Once the process is finished, finished must be called
// to remove the process from the process table. It should not be called until the process is finished but must always be called.
//
// cancel should be used to cancel the returned context, however it will not remove the process from the process table.
// finished will cancel the returned context and remove it from the process table.
//
// Most processes will not need to use the cancel function but there will be cases whereby you want to cancel the process but not immediately remove it from the
// process table.
func (pm *Manager) AddTypedContextTimeout(parent context.Context, timeout time.Duration, description, processType string, currentlyRunning bool) (ctx context.Context, cancel context.CancelFunc, finished FinishedFunc) {
if timeout <= 0 {
// it's meaningless to use timeout <= 0, and it must be a bug! so we must panic here to tell developers to make the timeout correct
panic("the timeout must be greater than zero, otherwise the context will be cancelled immediately")
}
ctx, cancel = context.WithTimeout(parent, timeout)
ctx, _, finished = pm.Add(ctx, description, cancel, processType, currentlyRunning)
return ctx, cancel, finished
}
// AddContextTimeout creates a new context and add it as a process. Once the process is finished, finished must be called
// to remove the process from the process table. It should not be called until the process is finished but must always be called.
//
@ -135,16 +157,7 @@ func (pm *Manager) AddTypedContext(parent context.Context, description, processT
// Most processes will not need to use the cancel function but there will be cases whereby you want to cancel the process but not immediately remove it from the
// process table.
func (pm *Manager) AddContextTimeout(parent context.Context, timeout time.Duration, description string) (ctx context.Context, cancel context.CancelFunc, finished FinishedFunc) {
if timeout <= 0 {
// it's meaningless to use timeout <= 0, and it must be a bug! so we must panic here to tell developers to make the timeout correct
panic("the timeout must be greater than zero, otherwise the context will be cancelled immediately")
}
ctx, cancel = context.WithTimeout(parent, timeout)
ctx, _, finished = pm.Add(ctx, description, cancel, NormalProcessType, true)
return ctx, cancel, finished
return pm.AddTypedContextTimeout(parent, timeout, description, NormalProcessType, true)
}
// Add create a new process
@ -159,6 +172,8 @@ func (pm *Manager) Add(ctx context.Context, description string, cancel context.C
parentPID = ""
}
ctx, traceTask := trace.NewTask(ctx, processType)
process := &process{
PID: pid,
ParentPID: parentPID,
@ -166,6 +181,7 @@ func (pm *Manager) Add(ctx context.Context, description string, cancel context.C
Start: start,
Cancel: cancel,
Type: processType,
TraceTrask: traceTask,
}
var finished FinishedFunc
@ -218,6 +234,7 @@ func (pm *Manager) nextPID() (start time.Time, pid IDType) {
}
func (pm *Manager) remove(process *process) {
process.TraceTrask.End()
deleted := false
pm.mutex.Lock()

View file

@ -5,12 +5,14 @@ package process
import (
"context"
"runtime/trace"
"time"
)
var (
SystemProcessType = "system"
RequestProcessType = "request"
GitProcessType = "git"
NormalProcessType = "normal"
NoneProcessType = "none"
)
@ -23,6 +25,7 @@ type process struct {
Start time.Time
Cancel context.CancelFunc
Type string
TraceTrask *trace.Task
}
// ToProcess converts a process to a externally usable Process

View file

@ -6,6 +6,7 @@ package common
import (
"fmt"
"net/http"
"runtime/trace"
"strings"
"code.gitea.io/gitea/modules/cache"
@ -43,6 +44,8 @@ func ProtocolMiddlewares() (handlers []any) {
return http.HandlerFunc(func(resp http.ResponseWriter, req *http.Request) {
ctx, _, finished := process.GetManager().AddTypedContext(req.Context(), fmt.Sprintf("%s: %s", req.Method, req.RequestURI), process.RequestProcessType, true)
defer finished()
trace.Log(ctx, "method", req.Method)
trace.Log(ctx, "url", req.RequestURI)
next.ServeHTTP(context.WrapResponseWriter(resp), req.WithContext(cache.WithCacheContext(ctx)))
})
})

View file

@ -7,6 +7,7 @@ import (
"archive/zip"
"fmt"
"runtime/pprof"
"runtime/trace"
"time"
"code.gitea.io/gitea/modules/httplib"
@ -15,17 +16,12 @@ import (
func MonitorDiagnosis(ctx *context.Context) {
seconds := ctx.FormInt64("seconds")
if seconds <= 5 {
seconds = 5
}
if seconds > 300 {
seconds = 300
}
seconds = max(5, min(300, seconds))
httplib.ServeSetHeaders(ctx.Resp, &httplib.ServeHeaderOptions{
ContentType: "application/zip",
Disposition: "attachment",
Filename: fmt.Sprintf("gitea-diagnosis-%s.zip", time.Now().Format("20060102-150405")),
Filename: fmt.Sprintf("forgejo-diagnosis-%s.zip", time.Now().Format("20060102-150405")),
})
zipWriter := zip.NewWriter(ctx.Resp)
@ -44,14 +40,27 @@ func MonitorDiagnosis(ctx *context.Context) {
return
}
err = pprof.StartCPUProfile(f)
if err == nil {
time.Sleep(time.Duration(seconds) * time.Second)
pprof.StopCPUProfile()
} else {
if err := pprof.StartCPUProfile(f); err != nil {
_, _ = f.Write([]byte(err.Error()))
}
f, err = zipWriter.CreateHeader(&zip.FileHeader{Name: "trace.dat", Method: zip.Deflate, Modified: time.Now()})
if err != nil {
ctx.ServerError("Failed to create zip file", err)
return
}
if err := trace.Start(f); err != nil {
_, _ = f.Write([]byte(err.Error()))
}
select {
case <-time.After(time.Duration(seconds) * time.Second):
case <-ctx.Done():
}
pprof.StopCPUProfile()
trace.Stop()
f, err = zipWriter.CreateHeader(&zip.FileHeader{Name: "goroutine-after.txt", Method: zip.Deflate, Modified: time.Now()})
if err != nil {
ctx.ServerError("Failed to create zip file", err)

View file

@ -7,6 +7,8 @@
{{svg "octicon-cpu" 16}}
{{else if eq .Process.Type "normal"}}
{{svg "octicon-terminal" 16}}
{{else if eq .Process.Type "git"}}
{{svg "octicon-git-branch" 16}}
{{else}}
{{svg "octicon-code" 16}}
{{end}}