If there is a network error (corrupted data at a minimum, but I believe I'm also encountering it when servers suddenly become unavailable), there are data races in shutdown(). This can lead to panics or lockup in the client. In some cases, it panics with:
panic: assignment to entry in nil map
goroutine 1598 [running]:
github.com/SlyMarbo/spdy/spdy3.(*Conn).Request(0xc823460480, 0xc8228c41c0, 0x3001630, 0xc82347c240, 0x0, 0x0, 0x0, 0x0, 0x0)
/Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/requests.go:129 +0x18d9
github.com/SlyMarbo/spdy/spdy3.(*Conn).RequestResponse(0xc823460480, 0xc8228c41c0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0)
/Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/requests.go:139 +0x294
github.com/SlyMarbo/spdy.(*Transport).RoundTrip(0xc8228cf080, 0xc8228c41c0, 0x61e635, 0x0, 0x0)
/Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/transport.go:190 +0x539
net/http.send(0xc8228c41c0, 0x1564738, 0xc8228cf080, 0xf, 0x0, 0x0)
/usr/local/Cellar/go/1.5/libexec/src/net/http/client.go:220 +0x73e
net/http.(*Client).send(0xc8232285a0, 0xc8228c41c0, 0xf, 0x0, 0x0)
/usr/local/Cellar/go/1.5/libexec/src/net/http/client.go:143 +0x1f8
net/http.(*Client).doFollowingRedirects(0xc8232285a0, 0xc8228c41c0, 0x621a88, 0x0, 0x0, 0x0)
/usr/local/Cellar/go/1.5/libexec/src/net/http/client.go:380 +0x105a
net/http.(*Client).Get(0xc8232285a0, 0x56b340, 0xf, 0x0, 0x0, 0x0)
/usr/local/Cellar/go/1.5/libexec/src/net/http/client.go:306 +0xc9
main.main.func2()
/Users/rnapier/work/agent/src/kace/konea/cmd/kontap/client.go:29 +0x5c
created by main.main
/Users/rnapier/work/agent/src/kace/konea/cmd/kontap/client.go:42 +0x142
In other cases, the panic is caught by a recover, but this leads to a deadlock because the send
goroutine is no longer processing packets. So when the error handler tries to send a RST, the whole connection locks up. This can lead to further symptoms like Ping()
never returning (which is how I discovered this issue originally).
This block in shutdown seems the primary cause of data races:
s.output = nil
s.Request = nil
s.Receiver = nil
s.header = nil
s.stop = nil
Here are a few of the conflicts that popped up in my tests:
github.com/SlyMarbo/spdy/spdy3.(*Conn).send()
Attempt 26: /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/io.go:129 +0x5cc
github.com/SlyMarbo/spdy/spdy3.(*RequestStream).ReceiveFrame.func2()
/Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/request_stream.go:193 +0x44
github.com/SlyMarbo/spdy/spdy3.(*Conn).send()
/Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/io.go:142 +0xac6
github.com/SlyMarbo/spdy/spdy3.(*RequestStream).ReceiveFrame.func1()
/Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/request_stream.go:183 +0x7a
To demonstrate the problem, I use the following programs:
- server - A spdy server that returns "SUCCESS"
- client - A spdy client that connects very rapidly via a proxy and prints the output
- proxy - An unreliable proxy. It corrupts 1% of packets
The client and servers generally show data races immediately, and the client will generally crash within a few seconds. If you try to reuse the client for every iteration (which better matches my real use case), the Get
call will hang.
// server.go
// openssl req -x509 -newkey rsa:2048 -keyout key.pem -out cert.pem -days 365 -nodes -batch
package main
import (
"fmt"
"log"
"net/http"
"github.com/SlyMarbo/spdy"
)
func httpHandler(w http.ResponseWriter, req *http.Request) {
fmt.Fprintf(w, "SUCCESS")
}
func main() {
http.HandleFunc("/", httpHandler)
log.Printf("About to listen on 10443. Go to https://127.0.0.1:34567/")
err := spdy.ListenAndServeTLS(":10443", "cert.pem", "key.pem", nil)
if err != nil {
log.Fatal(err)
}
}
// client.go
package main
import (
"fmt"
"io/ioutil"
"time"
"github.com/SlyMarbo/spdy" // This adds SPDY support to net/http
)
func main() {
n := 0
client := spdy.NewClient(true) // In this configuration, the Get hangs w/o providing an error
for {
n++
go func(iter int) {
// client := spdy.NewClient(true) // Using a new client every time avoids the hang in Get, making crashes more common
fmt.Printf("Attempt %d: ", iter)
res, err := client.Get("https://:34567/") // 10443 for direct
fmt.Printf("...")
if err != nil {
fmt.Println(err)
return
}
body, err := ioutil.ReadAll(res.Body)
if err != nil {
fmt.Println("ERROR")
return
}
res.Body.Close()
fmt.Println(string(body))
}(n)
time.Sleep(10 * time.Millisecond)
}
}
// proxy.go
package main
import (
"fmt"
"io"
"log"
"math/rand"
"net"
"os"
)
func main() {
port := 34567
dest := 10443
rate := 0.01
l, err := net.Listen("tcp", fmt.Sprintf(":%d", port))
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
fmt.Printf("http://%v -> :%d\n", l.Addr(), dest)
for {
conn, err := l.Accept()
if err != nil {
log.Fatal(err)
}
fmt.Printf("Connection from %s\n", conn.RemoteAddr())
go proxy(conn, dest, rate)
}
}
func proxy(in net.Conn, dest int, rate float64) {
out, err := net.Dial("tcp", fmt.Sprintf(":%d", dest))
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
go unreliableCopy(out, in, rate)
unreliableCopy(in, out, rate)
fmt.Printf("Disconnect from %s\n", in.RemoteAddr())
}
func unreliableCopy(dst io.Writer, src io.Reader, rate float64) {
buf := make([]byte, 32*1024)
for {
nr, er := src.Read(buf)
if nr > 0 {
maybeCorrupt(buf[0:nr], rate)
nw, ew := dst.Write(buf[0:nr])
if ew != nil {
break
}
if nr != nw {
break
}
}
if er == io.EOF {
break
}
if er != nil {
break
}
}
}
func maybeCorrupt(buf []byte, rate float64) {
if rand.Float64() < rate {
fmt.Println("Corrupting")
loc := rand.Intn(len(buf))
val := byte(rand.Intn(0x100))
buf[loc] = val
}
}
I'm currently seeing this very often in the field, and my infrastructure is locking up usually within a few hours of usage. I have one client, and many servers that come and go, so network errors are reasonably common.