
Go env:

GOARCH="amd64"
GOBIN="/root/"
GOEXE=""
GOHOSTARCH="amd64"
GOHOSTOS="linux"
GOOS="linux"
GOPATH="/data/workspace/kubernetes"
GORACE=""
GOROOT="/usr/local/go"
GOTOOLDIR="/usr/local/go/pkg/tool/linux_amd64"
GO15VENDOREXPERIMENT="1"
CC="gcc"
GOGCCFLAGS="-fPIC -m64 -pthread -fmessage-length=0"
CXX="g++"
CGO_ENABLED="1"

Go version:

go version go1.6.3 linux/amd64

This issue happened on a “performance test env” kube-apiserver under high load. kube-apiserver panicked and exited:

fatal error: concurrent map read and map write

goroutine 77930636 [running]:
runtime.throw(0x2f4c4c0, 0x21)
    /root/.gvm/gos/go1.6.3/src/runtime/panic.go:547 +0x90 fp=0xca67b477f0 sp=0xca67b477d8
runtime.mapaccess1_faststr(0x2a8e520, 0xc9e29000f0, 0x2c11220, 0xa, 0x433e360)
    /root/.gvm/gos/go1.6.3/src/runtime/hashmap_fast.go:202 +0x5b fp=0xca67b47850 sp=0xca67b477f0
k8s.io/kubernetes/pkg/httplog.(*respLogger).Log(0xcbddf2ae70)
    /data/gerrit/src/k8s.io/kubernetes/_output/local/go/src/k8s.io/kubernetes/pkg/httplog/log.go:180 +0x43d fp=0xca67b47af8 sp=0xca67b47850
k8s.io/kubernetes/pkg/apiserver.RecoverPanics.func1(0x7f099f157090, 0xcbddf2ae70, 0xcd7569e380)
    /data/gerrit/src/k8s.io/kubernetes/_output/local/go/src/k8s.io/kubernetes/pkg/apiserver/handlers.go:174 +0x15d fp=0xca67b47b50 sp=0xca67b47af8
net/http.HandlerFunc.ServeHTTP(0xc821a4eac0, 0x7f099f157058, 0xca0f4eb450, 0xcd7569e380)
    /root/.gvm/gos/go1.6.3/src/net/http/server.go:1618 +0x3a fp=0xca67b47b70 sp=0xca67b47b50
net/http.serverHandler.ServeHTTP(0xc8215a7b80, 0x7f099f157058, 0xca0f4eb450, 0xcd7569e380)
    /root/.gvm/gos/go1.6.3/src/net/http/server.go:2081 +0x19e fp=0xca67b47bd0 sp=0xca67b47b70
net/http.(*conn).serve(0xc8b5d6b980)
    /root/.gvm/gos/go1.6.3/src/net/http/server.go:1472 +0xf2e fp=0xca67b47f98 sp=0xca67b47bd0
runtime.goexit()
    /root/.gvm/gos/go1.6.3/src/runtime/asm_amd64.s:1998 +0x1 fp=0xca67b47fa0 sp=0xca67b47f98
created by net/http.(*Server).Serve
    /root/.gvm/gos/go1.6.3/src/net/http/server.go:2137 +0x44e

Corresponding source code:

pkg/apiserver/handlers.go

145 func RecoverPanics(handler http.Handler) http.Handler {
146     return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
147         defer func() {
148             if x := recover(); x != nil {
149                 http.Error(w, "apis panic. Look in log for details.", http.StatusInternalServerError)
150                 glog.Errorf("APIServer panic'd on %v %v: %v\n%s\n", req.Method, req.RequestURI, x, debug.Stack())
151             }
152         }()
153         defer httplog.NewLogged(req, &w).StacktraceWhen(
                httplog.StatusIsNot(
                    http.StatusOK,
                    http.StatusCreated,
                    http.StatusAccepted,
                    http.StatusBadRequest,
                    http.StatusMovedPermanently,
                    http.StatusTemporaryRedirect,
                    http.StatusConflict,
                    http.StatusNotFound,
                    http.StatusUnauthorized,
                    http.StatusForbidden,
                    errors.StatusUnprocessableEntity,
                    http.StatusSwitchingProtocols,
                    http.StatusRequestTimeout,
                    errors.StatusTooManyRequests,
                ),
170         ).Log()

            // Dispatch to the internal handler
            handler.ServeHTTP(w, req)
174     })
}

pkg/httplog/log.go:

159 func (rl *respLogger) Log() {
160     latency := time.Since(rl.startTime)
161     if glog.V(2) {
162         extraInfo := ""
163         if latency >= time.Millisecond*200 && latency < time.Second {
                extraInfo = fmt.Sprintf("%d00.Millisecond", latency/(time.Millisecond*100))
            } else if latency >= time.Second && latency < time.Minute { // Warning
                extraInfo = fmt.Sprintf("%d.Second", latency/(time.Second))
            } else if latency >= time.Minute { // nce will timeout
                extraInfo = fmt.Sprintf("%d.Minutes", latency/(time.Minute))
            }
            method := rl.req.Method
            if len(rl.req.Header["Detailed-Method"]) > 0 {
                method = rl.req.Header["Detailed-Method"][0]
            }

            remoteIP := rl.getXForwardIPAdress(rl.req)

            if !rl.hijacked {
                //glog.InfoDepth(1, fmt.Sprintf("%s %s: (%v) %v%v%v [%s %s]", rl.req.Method, rl.req.RequestURI, latency, rl.status, rl.statusStack, rl.addedInfo, rl.req.Header["User-Agent"], rl.req.RemoteAddr))
180             glog.InfoDepth(1, fmt.Sprintf("%v %s %s: (%sms) %v%v [%s %s]-%s %v", rl.req.Header["X-Requestid"], method, rl.req.RequestURI, GetMilliLatency(latency), rl.status, rl.addedInfo, rl.req.Header["User-Agent"], remoteIP, extraInfo, rl.statusStack))
            } else {
                //glog.InfoDepth(1, fmt.Sprintf("%s %s: (%v) hijacked [%s %s]", rl.req.Method, rl.req.RequestURI, latency, rl.req.Header["User-Agent"], rl.req.RemoteAddr))
                glog.InfoDepth(1, fmt.Sprintf("%v %s %s: (%sms) hijacked [%s %s]-%s", rl.req.Header["X-Requestid"], method, rl.req.RequestURI, GetMilliLatency(latency), rl.req.Header["User-Agent"], remoteIP, extraInfo))
            }
    }
}
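
The stack trace puts the faulting map read at log.go:180, i.e. the rl.req.Header["X-Requestid"] lookup in Log() above. Below is a stripped-down sketch of that pattern (hypothetical repro, not apiserver code): one goroutine keeps writing a request's header map while another reads it, as a handler mutating request.Header would while a logger reads the same map.

// repro.go - hypothetical sketch of the Header read/write pattern, not apiserver code.
package main

import (
    "fmt"
    "net/http"
)

func main() {
    req, _ := http.NewRequest("GET", "http://example.com/", nil)
    req.Header.Set("X-Requestid", "42")

    // Writer: mutates the header map, as a handler changing request.Header would.
    go func() {
        for i := 0; ; i++ {
            req.Header.Set("Detailed-Method", fmt.Sprint(i))
        }
    }()

    // Reader: the logger's lookup, e.g. rl.req.Header["X-Requestid"].
    for {
        _ = req.Header["X-Requestid"]
    }
}

Run with go run -race, a repro like this reports the data race; run without -race, the runtime's best-effort check eventually trips and reports the same fatal error as above.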

In handler.ServeHTTP I modify request.Header, but I can't find the reason for the crash:

(1) "serveHTTP" and "defer log" works in a serial manner. So not exist "concurrent read and write"

(2) Even if there were a "concurrent read and write", there is a "recover" func to deal with panics, so kube-apiserver should not exit.

This question has cost me a lot of time. Can anyone help? Thanks.

Will the race detector help? blog.golang.org/race-detector – Jiang YD
Thanks for your help. I built kube-apiserver with the -race option and ran it; nothing happened, there was no "DATA RACE" output. – Bling
“Because of its design, the race detector can detect race conditions only when they are actually triggered by running code, which means it's important to run race-enabled binaries under realistic workloads.” Sorry, I only noticed that just now. The panic happened only once in our "performance test env" and is not easy to reproduce. – Bling
I guess there is some bug in the glog implementation. – Jiang YD
And a fatal error is not a panic: panic controls the flow, but a fatal error makes the executable crash. – Jiang YD
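
That last comment answers point (2) above: "fatal error: concurrent map read and map write" is raised by runtime.throw (visible at the top of the stack trace), not by panic, and a throw cannot be intercepted by recover. A minimal sketch, assuming nothing beyond standard runtime behaviour:

// recover() intercepts panics only; a runtime throw such as a detected
// concurrent map access terminates the process regardless of any defers.
package main

import "fmt"

func main() {
    defer func() {
        // Never reached for "fatal error: concurrent map read and map write".
        if r := recover(); r != nil {
            fmt.Println("recovered:", r)
        }
    }()

    m := map[string]string{"k": "v"}
    go func() {
        for {
            m["k"] = "w" // concurrent write
        }
    }()
    for {
        _ = m["k"] // concurrent read; the runtime eventually throws a fatal error
    }
}

This is why the recover in RecoverPanics (handlers.go:148) does not keep the apiserver alive here.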

1 Answer


My guess is that you've stepped on the fact that the global timeout handler can't actually stop the goroutines working on the request. So when the connection times out (1m unless you've changed it), it returns an error. If the goroutine actually completes at the same time, it could also attempt to write to the connection. I thought we had added a lock to prevent this from causing a problem, but perhaps the headers are not always protected by the lock. If you can reproduce this in a clean source tree, then please file an issue at the Kubernetes github repo.
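
For illustration only, here is a simplified sketch of that failure mode (a hypothetical wrapper, not the real Kubernetes timeout handler): when the deadline passes, the outer handler responds on its own goroutine, but the inner handler keeps running and can still touch the request and response, so a header write there can race with a later header read in the logger.

// Hypothetical timeout wrapper sketch, not the actual apiserver code.
package main

import (
    "log"
    "net/http"
    "time"
)

// withTimeout responds on its own goroutine when the deadline passes, but it
// cannot stop the inner handler, which keeps running concurrently.
func withTimeout(d time.Duration, inner http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
        done := make(chan struct{})
        go func() {
            inner.ServeHTTP(w, req) // still running after the timeout fires
            close(done)
        }()
        select {
        case <-done:
        case <-time.After(d):
            // The wrapper gives up and writes a response, but the goroutine
            // above is NOT cancelled. If it later mutates req.Header while a
            // deferred logger reads the same map, the runtime can report
            // "concurrent map read and map write".
            http.Error(w, "request timed out", http.StatusGatewayTimeout)
        }
    })
}

func main() {
    slow := http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
        time.Sleep(2 * time.Second)               // outlives the 1s timeout below
        req.Header.Set("Detailed-Method", "SLOW") // header write racing with later reads
    })
    log.Fatal(http.ListenAndServe(":8080", withTimeout(time.Second, slow)))
}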